Removed clang-tidy check for do-while loops

Added 2 new constructors for FileTape which copy the size of existing tape into a new one
Implemented external sort of a Tape using heap sort
Rewritten FileTape constructors implementation to better take advantage of existing constructor calls
Added const to some parameters to indicate their immutability
Added unit tests for external_sort
This commit is contained in:
Egor 2024-10-27 05:18:23 +02:00
parent c2f46935dc
commit 33fee958a5
11 changed files with 274 additions and 28 deletions

View file

@ -1,4 +1,4 @@
Checks: 'clang-diagnostic-*,clang-analyzer-*,cppcoreguidelines-*,modernize-*,-modernize-use-trailing-return-type,readability-*' Checks: 'clang-diagnostic-*,clang-analyzer-*,cppcoreguidelines-*,modernize-*,-modernize-use-trailing-return-type,readability-*,-cppcoreguidelines-avoid-do-while'
CheckOptions: CheckOptions:
- { key: readability-identifier-naming.NamespaceCase, value: lower_case } - { key: readability-identifier-naming.NamespaceCase, value: lower_case }
- { key: readability-identifier-naming.ClassCase, value: CamelCase } - { key: readability-identifier-naming.ClassCase, value: CamelCase }

View file

@ -4,6 +4,7 @@ project(yadro-task VERSION 0.1 LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD 20)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_BUILD_TYPE Debug)
# clang-tidy # clang-tidy
find_program(CLANG_TIDY_EXE NAMES clang-tidy REQUIRED) find_program(CLANG_TIDY_EXE NAMES clang-tidy REQUIRED)

View file

@ -1,5 +1,4 @@
#include <iostream> #include <iostream>
#include <ostream>
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
std::cout << "Hello, World!" << std::endl; std::cout << "Hello, World!" << std::endl;

View file

@ -41,7 +41,7 @@ const static int FT_SEEK_OFFSET = FT_CELL_SIZE + 1;
* configuration file. Each cell is represented as a plain text 32-bit unsigned * configuration file. Each cell is represented as a plain text 32-bit unsigned
* int, padded to FT_CELL_SIZE with leading zeroes, separated by FT_DELIMETER. * int, padded to FT_CELL_SIZE with leading zeroes, separated by FT_DELIMETER.
* Each cell MUST be FT_CELL_SIZE long to correctly implement write operation * Each cell MUST be FT_CELL_SIZE long to correctly implement write operation
* without re-writing the entire file. FT_DELIMETER * without re-writing the entire file.
* *
* Example of a valid file for a FileTape * Example of a valid file for a FileTape
* - example.txt * - example.txt
@ -95,6 +95,24 @@ class FileTape : public Tape {
FileTape(size_t cells, std::string file_name, FileTape(size_t cells, std::string file_name,
FileTapeSettings settings = FT_DEFAULT_SETTINGS); FileTapeSettings settings = FT_DEFAULT_SETTINGS);
/**
* Initializes a new instance of FileTape that will create and open a file
* with the specified name and fill it with zeroed cells - the amount of
* cells will be the same as the specified FileTape.
* File must not exist and directory must be write accessible.
*
* This is necessary in the situations when the FileTapse size is a bigger
* than size_t max value. Seeking will not introduce delay during this
* operation.
*
* @param same_size_as FileTape, the size of which will be carried over to
* the new FileTape
* @param file_name Name of a file to be created and opened
* @param settings FileTape settings
*/
FileTape(FileTape &same_size_as, std::string file_name,
FileTapeSettings = FT_DEFAULT_SETTINGS);
/** /**
* Initializes a new instance of FileTape that will create and open a * Initializes a new instance of FileTape that will create and open a
* temporary file and fill it with *cells* amount of zeroed cells. * temporary file and fill it with *cells* amount of zeroed cells.
@ -107,6 +125,23 @@ class FileTape : public Tape {
*/ */
FileTape(size_t cells, FileTapeSettings settings = FT_DEFAULT_SETTINGS); FileTape(size_t cells, FileTapeSettings settings = FT_DEFAULT_SETTINGS);
/**
* Initializes a new instance of FileTape that will create and open a
* temporary file and fill it with zeroed cells -
* the amount of cells will be the same as the specified FileTape. File must
* not exist and directory must be write accessible.
*
* This is necessary in the situations when the FileTapse size is a bigger
* than size_t max value. Seeking will not introduce delay during this
* operation.
*
* @param same_size_as FileTape, the size of which will be carried over to
* the new FileTape
* @param file_name Name of a file to be created and opened
* @param settings FileTape settings
*/
FileTape(FileTape &same_size_as, FileTapeSettings = FT_DEFAULT_SETTINGS);
/** /**
* Initializes a new instance of FileTape that will create and open a * Initializes a new instance of FileTape that will create and open a
* temporary file, and fill it with the data from the provided vector. * temporary file, and fill it with the data from the provided vector.
@ -169,4 +204,4 @@ class FileTape : public Tape {
} // namespace tape } // namespace tape
#endif #endif // !FILE_TAPE_H

View file

@ -62,4 +62,4 @@ class Tape {
} // namespace tape } // namespace tape
#endif #endif // !TAPE_H

View file

@ -1,12 +1,47 @@
#ifndef TAPE_UTIL_H #ifndef TAPE_UTIL_H
#define TAPE_UTIL_H #define TAPE_UTIL_H
#include "filetape.h"
#include "tape.h" #include "tape.h"
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
using std::unique_ptr;
namespace tape { namespace tape {
void sort(Tape &input, Tape &output); /**
* Lambda, which takes the amount of cells a tape should have and returns
* a unique pointer to a temporary Tape with the specified amount of cells
*/
using TempTapeFactory = std::function<unique_ptr<Tape>(size_t)>;
/**
* TempTapeFactory that creates a new temporary FileTape
*/
const static inline TempTapeFactory FILETAPE_FACTORY =
[](size_t cells) -> unique_ptr<Tape> {
return std::make_unique<FileTape>(FileTape(cells));
};
/**
* External sort for Tape. Reads *ram_limit* cells into memory, sorts them,
* writes them to a temporary Tape, does this until all data from input Tape is
* sorted and put into temporary Tapes, then merges temporary Tapes sorted data
* into output Tape. Heap sort is preffered due to reduction in seek operations.
*
* @param input Tape, data of which should be sorted
* @param output Tape, where the sorted data is going to be written
* @param tmp_tape_factory Lambda expression that defines how new temporary
* Tapes are created
* @param sort_limit Max amount of cells that can be loaded into memory to use
* while sorting. Set to SIZE_MAX by default
*/
void external_sort(Tape &input, Tape &output, TempTapeFactory tmp_tape_factory,
size_t sort_limit = SIZE_MAX);
} // namespace tape } // namespace tape
#endif #endif // !TAPE_UTIL_H

View file

@ -1,7 +1,9 @@
#include "filetape.h" #include "filetape.h"
#include <cstddef>
#include <filesystem> #include <filesystem>
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#include <iostream>
#include <string> #include <string>
#include <thread> #include <thread>
@ -17,37 +19,50 @@ std::string generate_tmp_file_name() {
return tmp_file_name.data(); return tmp_file_name.data();
} }
void fill_file_with_empty_cells(size_t cells, fstream &file) { FileTape::FileTape(const std::string file_name, const FileTapeSettings settings)
file << std::setfill('0') << std::setw(tape::FT_CELL_SIZE) << '0';
for (size_t i = 1; i < cells; i++) {
file << tape::FT_DELIMETER << std::setfill('0')
<< std::setw(tape::FT_CELL_SIZE) << '0';
}
file.seekg(0, fstream::beg);
}
FileTape::FileTape(std::string file_name, FileTapeSettings settings)
: settings(settings), file_name(file_name) { : settings(settings), file_name(file_name) {
this->file = fstream(file_name); this->file = fstream(file_name);
} }
FileTape::FileTape(size_t cells, std::string file_name, FileTape::FileTape(const size_t cells, const std::string file_name,
FileTapeSettings settings) const FileTapeSettings settings)
: FileTape(file_name, settings) { : FileTape(file_name, settings) {
this->file = this->file =
fstream(this->file_name, fstream::in | fstream::out | fstream::trunc); fstream(this->file_name, fstream::in | fstream::out | fstream::trunc);
fill_file_with_empty_cells(cells, this->file); this->file << std::setfill('0') << std::setw(tape::FT_CELL_SIZE) << '0';
for (size_t i = 1; i < cells; i++) {
this->file << FT_DELIMETER << std::setfill('0')
<< std::setw(FT_CELL_SIZE) << '0';
}
this->file.seekg(0, fstream::beg);
} }
FileTape::FileTape(size_t cells, FileTapeSettings settings) FileTape::FileTape(const size_t cells, const FileTapeSettings settings)
: settings(settings), tmp(true) { : tape::FileTape(cells, generate_tmp_file_name(), settings) {
this->file_name = generate_tmp_file_name(); this->tmp = true;
}
FileTape::FileTape(FileTape &same_size_as, const std::string file_name,
const FileTapeSettings settings)
: settings(settings), file_name(file_name) {
this->file = this->file =
fstream(this->file_name, fstream::in | fstream::out | fstream::trunc); fstream(this->file_name, fstream::in | fstream::out | fstream::trunc);
fill_file_with_empty_cells(cells, this->file); this->file << std::setfill('0') << std::setw(tape::FT_CELL_SIZE) << '0';
while (same_size_as.seek_forward()) {
file << tape::FT_DELIMETER << std::setfill('0')
<< std::setw(tape::FT_CELL_SIZE) << '0';
}
this->file.seekg(0, fstream::beg);
same_size_as.file.seekg(0, fstream::beg);
} }
FileTape::FileTape(const std::vector<uint32_t> &data, FileTapeSettings settings) FileTape::FileTape(FileTape &same_size_as, const FileTapeSettings settings)
: tape::FileTape(same_size_as, generate_tmp_file_name(), settings) {
this->tmp = true;
}
FileTape::FileTape(const std::vector<uint32_t> &data,
const FileTapeSettings settings)
: settings(settings), tmp(true) { : settings(settings), tmp(true) {
this->file_name = generate_tmp_file_name(); this->file_name = generate_tmp_file_name();
this->file = this->file =
@ -98,7 +113,7 @@ uint32_t FileTape::read() {
return data; return data;
} }
void FileTape::write(uint32_t data) { void FileTape::write(const uint32_t data) {
std::this_thread::sleep_for(this->settings.write_delay); std::this_thread::sleep_for(this->settings.write_delay);
std::stringstream cell; std::stringstream cell;
cell << std::setfill('0') << std::setw(FT_CELL_SIZE) cell << std::setfill('0') << std::setw(FT_CELL_SIZE)

View file

@ -1,3 +1,104 @@
#include "tape_util.h" #include "tape_util.h"
#include <algorithm>
void tape::sort(Tape &input, Tape &output) {} using std::unique_ptr;
using std::vector;
using tape::Tape;
struct TapeHeapNode {
unique_ptr<Tape> tape;
uint32_t value;
};
class TapeHeapNodeComparator {
public:
bool operator()(const TapeHeapNode &obj1, const TapeHeapNode &obj2) {
return obj1.value > obj2.value;
}
};
// min heap implementation for tape heap nodes
// cant use priority queue since you cant pop the element and push it back
// without doing some pointer magic
class TapeHeap {
// define comparator to use in heap
constexpr static TapeHeapNodeComparator COMPARATOR =
TapeHeapNodeComparator();
private:
vector<TapeHeapNode> elements;
public:
void push(TapeHeapNode element) {
elements.push_back(std::move(element));
std::ranges::push_heap(elements, COMPARATOR);
}
TapeHeapNode pop() {
std::ranges::pop_heap(elements, COMPARATOR);
TapeHeapNode element = std::move(elements.back());
elements.pop_back();
return element;
}
bool empty() { return this->elements.empty(); }
};
void tape::external_sort(Tape &input, Tape &output,
TempTapeFactory tmp_tape_factory,
const size_t sort_limit) {
vector<uint32_t> data_buf;
TapeHeap heap;
bool input_ended = false;
bool all_data_in_memory = false;
while (!input_ended) {
for (size_t i = 0; i < sort_limit; i++) {
data_buf.push_back(input.read());
if (!input.seek_forward()) {
input_ended = true;
// if all input data was put into buffer in one go,
// prevent creation of external Tapes by settings
// all_data_in_memory flag to true
if (heap.empty()) {
all_data_in_memory = true;
}
break;
}
}
if (all_data_in_memory) {
std::ranges::sort(data_buf);
for (uint32_t num : data_buf) {
output.write(num);
output.seek_forward();
}
return;
}
// in order to avoid seeking twice as much in tmp tapes - sort the data
// in descending order. this way - the data will be read as though it is
// ascending, because we will be reading it from the end, while seeking
// backwards
std::ranges::sort(data_buf, std::greater());
unique_ptr<Tape> tmp_tape = tmp_tape_factory(sort_limit);
tmp_tape->write(data_buf[0]);
for (size_t i = 1; i < data_buf.size(); i++) {
tmp_tape->seek_forward();
tmp_tape->write(data_buf[i]);
}
heap.push(TapeHeapNode{.tape = std::move(tmp_tape),
.value = data_buf[data_buf.size() - 1]});
data_buf.clear();
}
// take a top tape node from the heap, write its value into the output,
// update the value in the node and push it back. if a node is out of values
// - discard it. do this until heap is empty. all the elements will
// eventually end up sorted in the output
while (!heap.empty()) {
TapeHeapNode node = heap.pop();
output.write(node.value);
output.seek_forward();
if (node.tape->seek_backwards()) {
node.value = node.tape->read();
heap.push(std::move(node)); // move is required to avoid copying
}
}
}

View file

@ -5,3 +5,9 @@ target_link_libraries(filetape_tests PRIVATE tapelib)
target_link_libraries(filetape_tests PRIVATE Catch2::Catch2WithMain) target_link_libraries(filetape_tests PRIVATE Catch2::Catch2WithMain)
add_test(filetape_tests filetape_tests) add_test(filetape_tests filetape_tests)
add_executable(filetape_sort_tests ${PROJECT_SOURCE_DIR}/tests/filetape_sort_tests.cpp)
target_link_libraries(filetape_sort_tests PRIVATE tapelib)
target_link_libraries(filetape_sort_tests PRIVATE Catch2::Catch2WithMain)
add_test(filetape_sort_tests filetape_sort_tests)

View file

@ -0,0 +1,55 @@
#include "tape_util.h"
#include <algorithm>
#include <catch2/catch_test_macros.hpp>
const static std::vector<uint32_t> TEST_DATA = {123, 26, 87, 266, 111, 234,
6, 63, 28, 1, 90, 33};
const static std::vector<uint32_t> TEST_DATA_SORTED = {
1, 6, 26, 28, 33, 63, 87, 90, 111, 123, 234, 266};
std::vector<uint32_t> read_from_tape_backwards(tape::Tape &tape) {
std::vector<uint32_t> data;
do {
data.push_back(tape.read());
} while (tape.seek_backwards());
std::ranges::reverse(data);
return data;
}
// NOLINTBEGIN(*-magic-numbers)
TEST_CASE("Sorting FileTape with external sort", "[sort]") {
tape::FileTape input(TEST_DATA);
tape::FileTape output(input);
SECTION("Sorting with no limitations") {
tape::external_sort(input, output, tape::FILETAPE_FACTORY);
REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED);
}
SECTION("Sorting with 4 elements in memory limit") {
tape::external_sort(input, output, tape::FILETAPE_FACTORY, 4);
REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED);
}
SECTION("Sorting with 9 elements in memory limit") {
tape::external_sort(input, output, tape::FILETAPE_FACTORY, 9);
REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED);
}
SECTION("Sorting with vector size elements in memory limit") {
tape::external_sort(input, output, tape::FILETAPE_FACTORY,
TEST_DATA.size());
REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED);
}
SECTION("Sorting with 1000 elements in memory limit") {
tape::external_sort(input, output, tape::FILETAPE_FACTORY, 1000);
REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED);
}
SECTION("Sorting with 1 elements in memory limit") {
tape::external_sort(input, output, tape::FILETAPE_FACTORY, 1);
REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED);
}
}
// NOLINTEND(*-magic-numbers)

View file

@ -1,6 +1,5 @@
#include "filetape.h" #include "filetape.h"
#include <catch2/catch_test_macros.hpp> #include <catch2/catch_test_macros.hpp>
#include <iostream>
#include <vector> #include <vector>
const static std::vector<uint32_t> TEST_DATA = {1, 12345, 0, 2222222222, const static std::vector<uint32_t> TEST_DATA = {1, 12345, 0, 2222222222,