From 33fee958a5ef231eeb9ddb006da04e8493d3bd58 Mon Sep 17 00:00:00 2001 From: erius Date: Sun, 27 Oct 2024 05:18:23 +0200 Subject: [PATCH] Removed clang-tidy check for do-while loops Added 2 new constructors for FileTape which copy the size of existing tape into a new one Implemented external sort of a Tape using heap sort Rewritten FileTape constructors implementation to better take advantage of existing constructor calls Added const to some parameters to indicate their immutability Added unit tests for external_sort --- .clang-tidy | 2 +- CMakeLists.txt | 1 + bin/ftsort.cpp | 1 - include/tapelib/filetape.h | 39 ++++++++++++- include/tapelib/tape.h | 2 +- include/tapelib/tape_util.h | 39 ++++++++++++- src/filetape.cpp | 53 ++++++++++------- src/tape_util.cpp | 103 +++++++++++++++++++++++++++++++++- tests/CMakeLists.txt | 6 ++ tests/filetape_sort_tests.cpp | 55 ++++++++++++++++++ tests/filetape_tests.cpp | 1 - 11 files changed, 274 insertions(+), 28 deletions(-) create mode 100644 tests/filetape_sort_tests.cpp diff --git a/.clang-tidy b/.clang-tidy index 07ccb91..3ea75e6 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,4 +1,4 @@ -Checks: 'clang-diagnostic-*,clang-analyzer-*,cppcoreguidelines-*,modernize-*,-modernize-use-trailing-return-type,readability-*' +Checks: 'clang-diagnostic-*,clang-analyzer-*,cppcoreguidelines-*,modernize-*,-modernize-use-trailing-return-type,readability-*,-cppcoreguidelines-avoid-do-while' CheckOptions: - { key: readability-identifier-naming.NamespaceCase, value: lower_case } - { key: readability-identifier-naming.ClassCase, value: CamelCase } diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ae64bf..1c99a36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ project(yadro-task VERSION 0.1 LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_BUILD_TYPE Debug) # clang-tidy find_program(CLANG_TIDY_EXE NAMES clang-tidy REQUIRED) diff --git a/bin/ftsort.cpp b/bin/ftsort.cpp index c5ca49e..f8dde37 100644 --- a/bin/ftsort.cpp +++ b/bin/ftsort.cpp @@ -1,5 +1,4 @@ #include -#include int main(int argc, char *argv[]) { std::cout << "Hello, World!" << std::endl; diff --git a/include/tapelib/filetape.h b/include/tapelib/filetape.h index 557cd2e..96f6ecb 100644 --- a/include/tapelib/filetape.h +++ b/include/tapelib/filetape.h @@ -41,7 +41,7 @@ const static int FT_SEEK_OFFSET = FT_CELL_SIZE + 1; * configuration file. Each cell is represented as a plain text 32-bit unsigned * int, padded to FT_CELL_SIZE with leading zeroes, separated by FT_DELIMETER. * Each cell MUST be FT_CELL_SIZE long to correctly implement write operation - * without re-writing the entire file. FT_DELIMETER + * without re-writing the entire file. * * Example of a valid file for a FileTape * - example.txt @@ -95,6 +95,24 @@ class FileTape : public Tape { FileTape(size_t cells, std::string file_name, FileTapeSettings settings = FT_DEFAULT_SETTINGS); + /** + * Initializes a new instance of FileTape that will create and open a file + * with the specified name and fill it with zeroed cells - the amount of + * cells will be the same as the specified FileTape. + * File must not exist and directory must be write accessible. + * + * This is necessary in the situations when the FileTapse size is a bigger + * than size_t max value. Seeking will not introduce delay during this + * operation. + * + * @param same_size_as FileTape, the size of which will be carried over to + * the new FileTape + * @param file_name Name of a file to be created and opened + * @param settings FileTape settings + */ + FileTape(FileTape &same_size_as, std::string file_name, + FileTapeSettings = FT_DEFAULT_SETTINGS); + /** * Initializes a new instance of FileTape that will create and open a * temporary file and fill it with *cells* amount of zeroed cells. @@ -107,6 +125,23 @@ class FileTape : public Tape { */ FileTape(size_t cells, FileTapeSettings settings = FT_DEFAULT_SETTINGS); + /** + * Initializes a new instance of FileTape that will create and open a + * temporary file and fill it with zeroed cells - + * the amount of cells will be the same as the specified FileTape. File must + * not exist and directory must be write accessible. + * + * This is necessary in the situations when the FileTapse size is a bigger + * than size_t max value. Seeking will not introduce delay during this + * operation. + * + * @param same_size_as FileTape, the size of which will be carried over to + * the new FileTape + * @param file_name Name of a file to be created and opened + * @param settings FileTape settings + */ + FileTape(FileTape &same_size_as, FileTapeSettings = FT_DEFAULT_SETTINGS); + /** * Initializes a new instance of FileTape that will create and open a * temporary file, and fill it with the data from the provided vector. @@ -169,4 +204,4 @@ class FileTape : public Tape { } // namespace tape -#endif +#endif // !FILE_TAPE_H diff --git a/include/tapelib/tape.h b/include/tapelib/tape.h index b774043..b8c4729 100644 --- a/include/tapelib/tape.h +++ b/include/tapelib/tape.h @@ -62,4 +62,4 @@ class Tape { } // namespace tape -#endif +#endif // !TAPE_H diff --git a/include/tapelib/tape_util.h b/include/tapelib/tape_util.h index 2b6e74e..835b1cd 100644 --- a/include/tapelib/tape_util.h +++ b/include/tapelib/tape_util.h @@ -1,12 +1,47 @@ #ifndef TAPE_UTIL_H #define TAPE_UTIL_H +#include "filetape.h" #include "tape.h" +#include +#include +#include +#include + +using std::unique_ptr; namespace tape { -void sort(Tape &input, Tape &output); +/** + * Lambda, which takes the amount of cells a tape should have and returns + * a unique pointer to a temporary Tape with the specified amount of cells + */ +using TempTapeFactory = std::function(size_t)>; + +/** + * TempTapeFactory that creates a new temporary FileTape + */ +const static inline TempTapeFactory FILETAPE_FACTORY = + [](size_t cells) -> unique_ptr { + return std::make_unique(FileTape(cells)); +}; + +/** + * External sort for Tape. Reads *ram_limit* cells into memory, sorts them, + * writes them to a temporary Tape, does this until all data from input Tape is + * sorted and put into temporary Tapes, then merges temporary Tapes sorted data + * into output Tape. Heap sort is preffered due to reduction in seek operations. + * + * @param input Tape, data of which should be sorted + * @param output Tape, where the sorted data is going to be written + * @param tmp_tape_factory Lambda expression that defines how new temporary + * Tapes are created + * @param sort_limit Max amount of cells that can be loaded into memory to use + * while sorting. Set to SIZE_MAX by default + */ +void external_sort(Tape &input, Tape &output, TempTapeFactory tmp_tape_factory, + size_t sort_limit = SIZE_MAX); } // namespace tape -#endif +#endif // !TAPE_UTIL_H diff --git a/src/filetape.cpp b/src/filetape.cpp index f434ae1..b42051b 100644 --- a/src/filetape.cpp +++ b/src/filetape.cpp @@ -1,7 +1,9 @@ #include "filetape.h" +#include #include #include #include +#include #include #include @@ -17,37 +19,50 @@ std::string generate_tmp_file_name() { return tmp_file_name.data(); } -void fill_file_with_empty_cells(size_t cells, fstream &file) { - file << std::setfill('0') << std::setw(tape::FT_CELL_SIZE) << '0'; - for (size_t i = 1; i < cells; i++) { - file << tape::FT_DELIMETER << std::setfill('0') - << std::setw(tape::FT_CELL_SIZE) << '0'; - } - file.seekg(0, fstream::beg); -} - -FileTape::FileTape(std::string file_name, FileTapeSettings settings) +FileTape::FileTape(const std::string file_name, const FileTapeSettings settings) : settings(settings), file_name(file_name) { this->file = fstream(file_name); } -FileTape::FileTape(size_t cells, std::string file_name, - FileTapeSettings settings) +FileTape::FileTape(const size_t cells, const std::string file_name, + const FileTapeSettings settings) : FileTape(file_name, settings) { this->file = fstream(this->file_name, fstream::in | fstream::out | fstream::trunc); - fill_file_with_empty_cells(cells, this->file); + this->file << std::setfill('0') << std::setw(tape::FT_CELL_SIZE) << '0'; + for (size_t i = 1; i < cells; i++) { + this->file << FT_DELIMETER << std::setfill('0') + << std::setw(FT_CELL_SIZE) << '0'; + } + this->file.seekg(0, fstream::beg); } -FileTape::FileTape(size_t cells, FileTapeSettings settings) - : settings(settings), tmp(true) { - this->file_name = generate_tmp_file_name(); +FileTape::FileTape(const size_t cells, const FileTapeSettings settings) + : tape::FileTape(cells, generate_tmp_file_name(), settings) { + this->tmp = true; +} + +FileTape::FileTape(FileTape &same_size_as, const std::string file_name, + const FileTapeSettings settings) + : settings(settings), file_name(file_name) { this->file = fstream(this->file_name, fstream::in | fstream::out | fstream::trunc); - fill_file_with_empty_cells(cells, this->file); + this->file << std::setfill('0') << std::setw(tape::FT_CELL_SIZE) << '0'; + while (same_size_as.seek_forward()) { + file << tape::FT_DELIMETER << std::setfill('0') + << std::setw(tape::FT_CELL_SIZE) << '0'; + } + this->file.seekg(0, fstream::beg); + same_size_as.file.seekg(0, fstream::beg); } -FileTape::FileTape(const std::vector &data, FileTapeSettings settings) +FileTape::FileTape(FileTape &same_size_as, const FileTapeSettings settings) + : tape::FileTape(same_size_as, generate_tmp_file_name(), settings) { + this->tmp = true; +} + +FileTape::FileTape(const std::vector &data, + const FileTapeSettings settings) : settings(settings), tmp(true) { this->file_name = generate_tmp_file_name(); this->file = @@ -98,7 +113,7 @@ uint32_t FileTape::read() { return data; } -void FileTape::write(uint32_t data) { +void FileTape::write(const uint32_t data) { std::this_thread::sleep_for(this->settings.write_delay); std::stringstream cell; cell << std::setfill('0') << std::setw(FT_CELL_SIZE) diff --git a/src/tape_util.cpp b/src/tape_util.cpp index 38217b8..509cb1d 100644 --- a/src/tape_util.cpp +++ b/src/tape_util.cpp @@ -1,3 +1,104 @@ #include "tape_util.h" +#include -void tape::sort(Tape &input, Tape &output) {} +using std::unique_ptr; +using std::vector; +using tape::Tape; + +struct TapeHeapNode { + unique_ptr tape; + uint32_t value; +}; + +class TapeHeapNodeComparator { + public: + bool operator()(const TapeHeapNode &obj1, const TapeHeapNode &obj2) { + return obj1.value > obj2.value; + } +}; + +// min heap implementation for tape heap nodes +// cant use priority queue since you cant pop the element and push it back +// without doing some pointer magic +class TapeHeap { + // define comparator to use in heap + constexpr static TapeHeapNodeComparator COMPARATOR = + TapeHeapNodeComparator(); + + private: + vector elements; + + public: + void push(TapeHeapNode element) { + elements.push_back(std::move(element)); + std::ranges::push_heap(elements, COMPARATOR); + } + + TapeHeapNode pop() { + std::ranges::pop_heap(elements, COMPARATOR); + TapeHeapNode element = std::move(elements.back()); + elements.pop_back(); + return element; + } + + bool empty() { return this->elements.empty(); } +}; + +void tape::external_sort(Tape &input, Tape &output, + TempTapeFactory tmp_tape_factory, + const size_t sort_limit) { + vector data_buf; + TapeHeap heap; + bool input_ended = false; + bool all_data_in_memory = false; + while (!input_ended) { + for (size_t i = 0; i < sort_limit; i++) { + data_buf.push_back(input.read()); + if (!input.seek_forward()) { + input_ended = true; + // if all input data was put into buffer in one go, + // prevent creation of external Tapes by settings + // all_data_in_memory flag to true + if (heap.empty()) { + all_data_in_memory = true; + } + break; + } + } + if (all_data_in_memory) { + std::ranges::sort(data_buf); + for (uint32_t num : data_buf) { + output.write(num); + output.seek_forward(); + } + return; + } + // in order to avoid seeking twice as much in tmp tapes - sort the data + // in descending order. this way - the data will be read as though it is + // ascending, because we will be reading it from the end, while seeking + // backwards + std::ranges::sort(data_buf, std::greater()); + unique_ptr tmp_tape = tmp_tape_factory(sort_limit); + tmp_tape->write(data_buf[0]); + for (size_t i = 1; i < data_buf.size(); i++) { + tmp_tape->seek_forward(); + tmp_tape->write(data_buf[i]); + } + heap.push(TapeHeapNode{.tape = std::move(tmp_tape), + .value = data_buf[data_buf.size() - 1]}); + data_buf.clear(); + } + // take a top tape node from the heap, write its value into the output, + // update the value in the node and push it back. if a node is out of values + // - discard it. do this until heap is empty. all the elements will + // eventually end up sorted in the output + while (!heap.empty()) { + TapeHeapNode node = heap.pop(); + output.write(node.value); + output.seek_forward(); + if (node.tape->seek_backwards()) { + node.value = node.tape->read(); + heap.push(std::move(node)); // move is required to avoid copying + } + } +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7bb6c28..3a1814d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,3 +5,9 @@ target_link_libraries(filetape_tests PRIVATE tapelib) target_link_libraries(filetape_tests PRIVATE Catch2::Catch2WithMain) add_test(filetape_tests filetape_tests) + +add_executable(filetape_sort_tests ${PROJECT_SOURCE_DIR}/tests/filetape_sort_tests.cpp) +target_link_libraries(filetape_sort_tests PRIVATE tapelib) +target_link_libraries(filetape_sort_tests PRIVATE Catch2::Catch2WithMain) + +add_test(filetape_sort_tests filetape_sort_tests) diff --git a/tests/filetape_sort_tests.cpp b/tests/filetape_sort_tests.cpp new file mode 100644 index 0000000..e38e077 --- /dev/null +++ b/tests/filetape_sort_tests.cpp @@ -0,0 +1,55 @@ +#include "tape_util.h" +#include +#include + +const static std::vector TEST_DATA = {123, 26, 87, 266, 111, 234, + 6, 63, 28, 1, 90, 33}; +const static std::vector TEST_DATA_SORTED = { + 1, 6, 26, 28, 33, 63, 87, 90, 111, 123, 234, 266}; + +std::vector read_from_tape_backwards(tape::Tape &tape) { + std::vector data; + do { + data.push_back(tape.read()); + } while (tape.seek_backwards()); + std::ranges::reverse(data); + return data; +} + +// NOLINTBEGIN(*-magic-numbers) +TEST_CASE("Sorting FileTape with external sort", "[sort]") { + tape::FileTape input(TEST_DATA); + tape::FileTape output(input); + + SECTION("Sorting with no limitations") { + tape::external_sort(input, output, tape::FILETAPE_FACTORY); + REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED); + } + + SECTION("Sorting with 4 elements in memory limit") { + tape::external_sort(input, output, tape::FILETAPE_FACTORY, 4); + REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED); + } + + SECTION("Sorting with 9 elements in memory limit") { + tape::external_sort(input, output, tape::FILETAPE_FACTORY, 9); + REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED); + } + + SECTION("Sorting with vector size elements in memory limit") { + tape::external_sort(input, output, tape::FILETAPE_FACTORY, + TEST_DATA.size()); + REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED); + } + + SECTION("Sorting with 1000 elements in memory limit") { + tape::external_sort(input, output, tape::FILETAPE_FACTORY, 1000); + REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED); + } + + SECTION("Sorting with 1 elements in memory limit") { + tape::external_sort(input, output, tape::FILETAPE_FACTORY, 1); + REQUIRE(read_from_tape_backwards(output) == TEST_DATA_SORTED); + } +} +// NOLINTEND(*-magic-numbers) diff --git a/tests/filetape_tests.cpp b/tests/filetape_tests.cpp index d842523..ceb0e0a 100644 --- a/tests/filetape_tests.cpp +++ b/tests/filetape_tests.cpp @@ -1,6 +1,5 @@ #include "filetape.h" #include -#include #include const static std::vector TEST_DATA = {1, 12345, 0, 2222222222,