diff --git a/externalsort/README.md b/externalsort/README.md new file mode 100644 index 0000000..4e93e9d --- /dev/null +++ b/externalsort/README.md @@ -0,0 +1,59 @@ +## externalsort + +В этой задаче нужно написать однопроходную внешнюю сортировку слиянием. +Моделируется ситуация, в которой данные расположены на внешних устройствах и суммарно не вмещаются в оперативную память, +но каждый кусочек по отдельности вмещается. + +Задача разбита на 3 составные части. + +#### Reader & writer + +Реализовать интерфейсы для построчного чтения/записи строк: +``` +type LineReader interface { + ReadLine() (string, error) +} + +type LineWriterFlusher interface { + Write(l string) error + Flush() error +} +``` +и два конструктора: +``` +func NewReader(r io.Reader) LineReader +func NewWriterFlusher(w io.Writer) LineWriterFlusher +``` + +`NewReader` оборачивает переданный `io.Reader` в `LineReader`. + +Вызов `ReadLine` должен читать одну строку. +Строка имеет произвольную длину. +Конец строки определяется переводом строки ('\n'). +Непустая последовательность символов после последнего перевода строки также считается строкой. + +`ReadLine` должен возвращать `io.EOF` при достижении конца файла. + +#### Merge + +Функция слияния произвольного количества отсортированных групп строк: +``` +func Merge(w LineWriterFlusher, readers ...LineReader) error +``` + +`Merge` по необходимости читает из reader'ов и пишет во writer. + +#### Sort + +``` +func Sort(w io.Writer, in ...string) error +``` + +Функция принимает на вход произвольное количество файлов, каждый из которых помещается в оперативную память, +а также writer для записи результата. + +Результаты сортировки отдельных файлов можно записывать поверх входных данных. 
+ +### Ссылки + +* container/heap: https://golang.org/pkg/container/heap/ diff --git a/externalsort/io.go b/externalsort/io.go new file mode 100644 index 0000000..9867cb8 --- /dev/null +++ b/externalsort/io.go @@ -0,0 +1,12 @@ +// +build !change + +package externalsort + +type LineReader interface { + ReadLine() (string, error) +} + +type LineWriterFlusher interface { + Write(l string) error + Flush() error +} diff --git a/externalsort/io_test.go b/externalsort/io_test.go new file mode 100644 index 0000000..7ff8c19 --- /dev/null +++ b/externalsort/io_test.go @@ -0,0 +1,147 @@ +package externalsort + +import ( + "bytes" + "errors" + "io" + "strings" + "testing" + "testing/iotest" + + "github.com/stretchr/testify/require" +) + +func newStringReader(s string) LineReader { + return NewReader(strings.NewReader(s)) +} + +func readAll(r LineReader) (lines []string, err error) { + for { + l, err := r.ReadLine() + if err != nil { + if errors.Is(err, io.EOF) { + return lines, nil + } + return nil, err + } + lines = append(lines, l) + } +} + +func TestLineReader(t *testing.T) { + type Wrapper func(r io.Reader) io.Reader + + for _, tc := range []struct { + name string + in string + wrappers []Wrapper + expected []string + }{ + { + name: "empty", + in: "", + expected: []string{""}, + }, + { + name: "one-row", + in: "abc", + expected: []string{"abc"}, + }, + { + name: "multiple-rows", + in: `a + +b +b +`, + expected: []string{"a\n", "\n", "b\n", "b\n", ""}, + }, + { + name: "large-row", + in: strings.Repeat("a", 4097), + expected: []string{strings.Repeat("a", 4097)}, + }, + { + name: "huge-row", + in: strings.Repeat("a", 65537), + expected: []string{strings.Repeat("a", 65537)}, + }, + { + name: "half-reader", + in: strings.Repeat("a", 1025), + wrappers: []Wrapper{iotest.HalfReader}, + expected: []string{strings.Repeat("a", 1025)}, + }, + { + name: "eof", + in: strings.Repeat("a", 1025), + wrappers: []Wrapper{iotest.DataErrReader}, + expected: []string{strings.Repeat("a", 1025)}, 
+ }, + } { + t.Run(tc.name, func(t *testing.T) { + var r io.Reader + r = strings.NewReader(tc.in) + for _, w := range tc.wrappers { + r = w(r) + } + + lineReader := NewReader(r) + + lines, err := readAll(lineReader) + require.NoError(t, err) + + require.Equal(t, strings.Join(lines, ""), tc.in) + require.Len(t, lines, len(tc.expected), + "expected: %+v, got: %+v", tc.expected, lines) + require.Equal(t, tc.expected, lines) + }) + } +} + +type brokenReader int + +func (b brokenReader) Read(data []byte) (n int, err error) { + return 0, errors.New("read is broken") +} + +func TestLineReader_error(t *testing.T) { + _, err := NewReader(new(brokenReader)).ReadLine() + require.Error(t, err) + require.False(t, errors.Is(err, io.EOF)) + + r := newStringReader("") + _, err = r.ReadLine() + require.NoError(t, err) + + _, err = r.ReadLine() + require.True(t, errors.Is(err, io.EOF)) +} + +func TestLineWriterFlusher(t *testing.T) { + for _, tc := range []struct { + name string + lines []string + }{ + { + name: "empty", + }, + { + name: "simple", + lines: []string{"a\n", "b\n", "c\n"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + var buf bytes.Buffer + w := NewWriterFlusher(&buf) + + for _, l := range tc.lines { + require.NoError(t, w.Write(l)) + } + + require.NoError(t, w.Flush()) + + require.Equal(t, strings.Join(tc.lines, ""), buf.String()) + }) + } +} diff --git a/externalsort/sort.go b/externalsort/sort.go new file mode 100644 index 0000000..5760d49 --- /dev/null +++ b/externalsort/sort.go @@ -0,0 +1,23 @@ +// +build !solution + +package externalsort + +import ( + "io" +) + +func NewReader(r io.Reader) LineReader { + panic("implement me") +} + +func NewWriterFlusher(w io.Writer) LineWriterFlusher { + panic("implement me") +} + +func Merge(w LineWriterFlusher, readers ...LineReader) error { + panic("implement me") +} + +func Sort(w io.Writer, in ...string) error { + panic("implement me") +} diff --git a/externalsort/sort_test.go b/externalsort/sort_test.go new file mode 
100644 index 0000000..054dce1 --- /dev/null +++ b/externalsort/sort_test.go @@ -0,0 +1,113 @@ +package externalsort + +import ( + "bytes" + "io/ioutil" + "path" + "strings" + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/slon/shad-go/tools/testtool" +) + +func TestMerge(t *testing.T) { + for _, tc := range []struct { + name string + in []string + out string + }{ + { + name: "simple", + in: []string{`0`, `1 +1 +1`}, + out: `0 +1 +1 +1`, + }, + { + // Merge believes lines are read in sorted order. + name: "single-unsorted-file", + in: []string{`1 +0`}, + out: `1 +0`, + }, + } { + t.Run(tc.name, func(t *testing.T) { + out := &bytes.Buffer{} + w := NewWriterFlusher(out) + + var readers []LineReader + for _, s := range tc.in { + readers = append(readers, newStringReader(s)) + } + + err := Merge(w, readers...) + require.NoError(t, err) + + require.Equal(t, tc.out, out.String()) + }) + } +} + +func TestSort_fileNotFound(t *testing.T) { + var buf bytes.Buffer + err := Sort(&buf, testtool.RandomName()) + require.Error(t, err) +} + +func TestSort(t *testing.T) { + testDir := path.Join("./testdata", "sort") + + readTestCase := func(dir string) (in []string, out string) { + files, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + for _, f := range files { + if strings.HasPrefix(f.Name(), "in") { + in = append(in, path.Join(dir, f.Name())) + } + if f.Name() == "out.txt" { + out = path.Join(dir, f.Name()) + } + } + + return + } + + for _, d := range listDirs(t, testDir) { + testCaseDir := path.Join(testDir, d) + + t.Run(d, func(t *testing.T) { + in, out := readTestCase(testCaseDir) + + var buf bytes.Buffer + err := Sort(&buf, in...) 
+ require.NoError(t, err) + + expected, err := ioutil.ReadFile(out) + require.NoError(t, err) + + require.Equal(t, string(expected), buf.String()) + }) + } +} + +func listDirs(t *testing.T, dir string) []string { + t.Helper() + + files, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + var dirs []string + for _, f := range files { + if f.IsDir() { + dirs = append(dirs, f.Name()) + } + } + + return dirs +} diff --git a/externalsort/testdata/sort/1/in1.txt b/externalsort/testdata/sort/1/in1.txt new file mode 100644 index 0000000..f2ba8f8 Binary files /dev/null and b/externalsort/testdata/sort/1/in1.txt differ diff --git a/externalsort/testdata/sort/1/out.txt b/externalsort/testdata/sort/1/out.txt new file mode 100644 index 0000000..f2ba8f8 Binary files /dev/null and b/externalsort/testdata/sort/1/out.txt differ diff --git a/externalsort/testdata/sort/2/in1.txt b/externalsort/testdata/sort/2/in1.txt new file mode 100644 index 0000000..31b838d Binary files /dev/null and b/externalsort/testdata/sort/2/in1.txt differ diff --git a/externalsort/testdata/sort/2/out.txt b/externalsort/testdata/sort/2/out.txt new file mode 100644 index 0000000..31b838d Binary files /dev/null and b/externalsort/testdata/sort/2/out.txt differ diff --git a/externalsort/testdata/sort/3/in1.txt b/externalsort/testdata/sort/3/in1.txt new file mode 100644 index 0000000..c227083 Binary files /dev/null and b/externalsort/testdata/sort/3/in1.txt differ diff --git a/externalsort/testdata/sort/3/in2.txt b/externalsort/testdata/sort/3/in2.txt new file mode 100644 index 0000000..ae7dd0b Binary files /dev/null and b/externalsort/testdata/sort/3/in2.txt differ diff --git a/externalsort/testdata/sort/3/in3.txt b/externalsort/testdata/sort/3/in3.txt new file mode 100644 index 0000000..6d9973c Binary files /dev/null and b/externalsort/testdata/sort/3/in3.txt differ diff --git a/externalsort/testdata/sort/3/out.txt b/externalsort/testdata/sort/3/out.txt new file mode 100644 index 0000000..b97a368 Binary 
files /dev/null and b/externalsort/testdata/sort/3/out.txt differ