diff --git a/externalsort/README.md b/externalsort/README.md new file mode 100644 index 0000000..51de944 --- /dev/null +++ b/externalsort/README.md @@ -0,0 +1,58 @@ +## externalsort + +В этой задаче нужно написать однопроходную внешнюю сортировку слиянием. +Моделируется ситуация, в которой данные расположены на внешних устройствах и суммарно не вмещаются в оперативную память, +но каждый кусочек по отдельности вмещается. + +Задача разбита на 3 составные части. + +#### Reader & writer + +Реализовать интерфейсы для построчного чтения/записи строк: +``` +type LineReader interface { + ReadLine() (string, error) +} + +type LineWriter interface { + Write(l string) error +} +``` +и два конструктора: +``` +func NewReader(r io.Reader) LineReader +func NewWriter(w io.Writer) LineWriter +``` + +`NewReader` оборачивает переданный `io.Reader` в `LineReader`. + +Вызов `ReadLine` должен читать одну строку. +Строка имеет произвольную длину. +Конец строки определяется переводом строки ('\n'). +Непустая последовательность символов после последнего перевода строки также считается строкой. + +`ReadLine` должен возвращать `io.EOF` при достижении конца файла. + +#### Merge + +Функция слияния произвольного количества отсортированных групп строк: +``` +func Merge(w LineWriter, readers ...LineReader) error +``` + +`Merge` по необходимости читает из reader'ов и пишет во writer. + +#### Sort + +``` +func Sort(w io.Writer, in ...string) error +``` + +Функция принимает на вход произвольное количество файлов, каждый из которых помещается в оперативную память, +а также writer для записи результата. + +Результаты сортировки отдельных файлов можно записывать поверх входных данных. 
+ +### Ссылки + +* container/heap: https://golang.org/pkg/container/heap/ diff --git a/externalsort/io.go b/externalsort/io.go new file mode 100644 index 0000000..13f68ce --- /dev/null +++ b/externalsort/io.go @@ -0,0 +1,11 @@ +// +build !change + +package externalsort + +type LineReader interface { + ReadLine() (string, error) +} + +type LineWriter interface { + Write(l string) error +} diff --git a/externalsort/io_test.go b/externalsort/io_test.go new file mode 100644 index 0000000..129677f --- /dev/null +++ b/externalsort/io_test.go @@ -0,0 +1,167 @@ +package externalsort + +import ( + "bufio" + "bytes" + "errors" + "io" + "strings" + "testing" + "testing/iotest" + + "github.com/stretchr/testify/require" +) + +func newStringReader(s string) LineReader { + return NewReader(strings.NewReader(s)) +} + +func readAll(r LineReader) (lines []string, err error) { + for { + l, err := r.ReadLine() + if err != nil { + if errors.Is(err, io.EOF) { + return lines, nil + } + return nil, err + } + lines = append(lines, l) + } +} + +func TestLineReader(t *testing.T) { + type Wrapper func(r io.Reader) io.Reader + + for _, tc := range []struct { + name string + in string + wrappers []Wrapper + expected []string + }{ + { + name: "empty", + in: "", + expected: nil, + }, + { + name: "one-row", + in: "abc", + expected: []string{"abc"}, + }, + { + name: "linebreak", + in: `abc + +`, + expected: []string{"abc", ""}, + }, + { + name: "multiple-rows", + in: `a + +b +b +`, + expected: []string{"a", "", "b", "b"}, + }, + { + name: "large-row", + in: strings.Repeat("a", 4097), + expected: []string{strings.Repeat("a", 4097)}, + }, + { + name: "huge-row", + in: strings.Repeat("a", 65537), + expected: []string{strings.Repeat("a", 65537)}, + }, + { + name: "half-reader", + in: strings.Repeat("a", 1025), + wrappers: []Wrapper{iotest.HalfReader}, + expected: []string{strings.Repeat("a", 1025)}, + }, + { + name: "eof", + in: strings.Repeat("a", 1025), + wrappers: []Wrapper{iotest.DataErrReader}, + 
expected: []string{strings.Repeat("a", 1025)}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + var r io.Reader + r = strings.NewReader(tc.in) + for _, w := range tc.wrappers { + r = w(r) + } + + lineReader := NewReader(r) + + lines, err := readAll(lineReader) + require.NoError(t, err) + + require.Len(t, lines, len(tc.expected), + "expected: %+v, got: %+v", tc.expected, lines) + require.Equal(t, tc.expected, lines) + }) + } +} + +type brokenReader int + +func (r brokenReader) Read(data []byte) (n int, err error) { + return 0, errors.New("read is broken") +} + +type eofReader int + +func (r eofReader) Read(p []byte) (n int, err error) { + return 0, io.EOF +} + +func TestLineReader_error(t *testing.T) { + _, err := NewReader(new(brokenReader)).ReadLine() + require.Error(t, err) + require.False(t, errors.Is(err, io.EOF)) + + r := NewReader(new(eofReader)) + _, err = r.ReadLine() + require.True(t, errors.Is(err, io.EOF)) +} + +func TestLineWriter(t *testing.T) { + for _, tc := range []struct { + name string + lines []string + }{ + { + name: "empty", + lines: []string{""}, + }, + { + name: "simple", + lines: []string{"a", "b", "c"}, + }, + { + name: "large-line", + lines: []string{strings.Repeat("xx", 2049), "x", "y"}, + }, + { + name: "huge-line", + lines: []string{strings.Repeat("?", 65537), "?", "!"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + var buf bytes.Buffer + w := bufio.NewWriter(&buf) + lw := NewWriter(w) + + for _, l := range tc.lines { + require.NoError(t, lw.Write(l)) + } + + require.NoError(t, w.Flush()) + expected := strings.Join(tc.lines, "\n") + "\n" + require.Equal(t, expected, buf.String()) + }) + } +} diff --git a/externalsort/sort.go b/externalsort/sort.go new file mode 100644 index 0000000..170d925 --- /dev/null +++ b/externalsort/sort.go @@ -0,0 +1,23 @@ +// +build !solution + +package externalsort + +import ( + "io" +) + +func NewReader(r io.Reader) LineReader { + panic("implement me") +} + +func NewWriter(w io.Writer) LineWriter { + 
panic("implement me") +} + +func Merge(w LineWriter, readers ...LineReader) error { + panic("implement me") +} + +func Sort(w io.Writer, in ...string) error { + panic("implement me") +} diff --git a/externalsort/sort_test.go b/externalsort/sort_test.go new file mode 100644 index 0000000..9499725 --- /dev/null +++ b/externalsort/sort_test.go @@ -0,0 +1,150 @@ +package externalsort + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "path" + "strings" + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/slon/shad-go/tools/testtool" +) + +func TestMerge(t *testing.T) { + for _, tc := range []struct { + name string + in []string + out string + }{ + { + name: "simple", + in: []string{`0`, `1 +1 +1`}, + out: `0 +1 +1 +1 +`, + }, + { + // Merge believes lines are read in sorted order. + name: "single-unsorted-file", + in: []string{`1 +0`}, + out: `1 +0 +`, + }, + } { + t.Run(tc.name, func(t *testing.T) { + out := &bytes.Buffer{} + w := bufio.NewWriter(out) + lw := NewWriter(w) + + var readers []LineReader + for _, s := range tc.in { + readers = append(readers, newStringReader(s)) + } + + err := Merge(lw, readers...) 
+ require.NoError(t, err) + + require.NoError(t, w.Flush()) + require.Equal(t, tc.out, out.String()) + }) + } +} + +func TestSort_fileNotFound(t *testing.T) { + var buf bytes.Buffer + err := Sort(&buf, testtool.RandomName()) + require.Error(t, err) +} + +func TestSort(t *testing.T) { + testDir := path.Join("./testdata", "sort") + + readTestCase := func(dir string) (in []string, out string) { + files, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + for _, f := range files { + if strings.HasPrefix(f.Name(), "in") { + in = append(in, path.Join(dir, f.Name())) + } + if f.Name() == "out.txt" { + out = path.Join(dir, f.Name()) + } + } + + return + } + + for _, d := range listDirs(t, testDir) { + testCaseDir := path.Join(testDir, d) + + t.Run(d, func(t *testing.T) { + tmpDir, err := ioutil.TempDir("", fmt.Sprintf("sort%s-", d)) + require.NoError(t, err) + defer func() { _ = os.RemoveAll(tmpDir) }() + + in, out := readTestCase(testCaseDir) + in = copyFiles(t, in, tmpDir) + + var buf bytes.Buffer + w := bufio.NewWriter(&buf) + require.NoError(t, Sort(w, in...)) + + expected, err := ioutil.ReadFile(out) + require.NoError(t, err) + + require.NoError(t, w.Flush()) + require.Equal(t, string(expected), buf.String()) + }) + } +} + +func listDirs(t *testing.T, dir string) []string { + t.Helper() + + files, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + var dirs []string + for _, f := range files { + if f.IsDir() { + dirs = append(dirs, f.Name()) + } + } + + return dirs +} + +func copyFiles(t *testing.T, in []string, dir string) []string { + t.Helper() + + var ret []string + for _, f := range in { + ret = append(ret, copyFile(t, f, dir)) + } + + return ret +} + +func copyFile(t *testing.T, f, dir string) string { + t.Helper() + + data, err := ioutil.ReadFile(f) + require.NoError(t, err) + + dst := path.Join(dir, path.Base(f)) + err = ioutil.WriteFile(dst, data, 0644) + require.NoError(t, err) + + return dst +} diff --git a/externalsort/testdata/sort/1/in1.txt 
b/externalsort/testdata/sort/1/in1.txt new file mode 100644 index 0000000..f2ba8f8 Binary files /dev/null and b/externalsort/testdata/sort/1/in1.txt differ diff --git a/externalsort/testdata/sort/1/out.txt b/externalsort/testdata/sort/1/out.txt new file mode 100644 index 0000000..8baef1b Binary files /dev/null and b/externalsort/testdata/sort/1/out.txt differ diff --git a/externalsort/testdata/sort/2/in1.txt b/externalsort/testdata/sort/2/in1.txt new file mode 100644 index 0000000..8baef1b Binary files /dev/null and b/externalsort/testdata/sort/2/in1.txt differ diff --git a/externalsort/testdata/sort/2/out.txt b/externalsort/testdata/sort/2/out.txt new file mode 100644 index 0000000..8baef1b Binary files /dev/null and b/externalsort/testdata/sort/2/out.txt differ diff --git a/externalsort/testdata/sort/3/in1.txt b/externalsort/testdata/sort/3/in1.txt new file mode 100644 index 0000000..c227083 Binary files /dev/null and b/externalsort/testdata/sort/3/in1.txt differ diff --git a/externalsort/testdata/sort/3/in2.txt b/externalsort/testdata/sort/3/in2.txt new file mode 100644 index 0000000..ae7dd0b Binary files /dev/null and b/externalsort/testdata/sort/3/in2.txt differ diff --git a/externalsort/testdata/sort/3/in3.txt b/externalsort/testdata/sort/3/in3.txt new file mode 100644 index 0000000..2ceb12b Binary files /dev/null and b/externalsort/testdata/sort/3/in3.txt differ diff --git a/externalsort/testdata/sort/3/out.txt b/externalsort/testdata/sort/3/out.txt new file mode 100644 index 0000000..36fc464 Binary files /dev/null and b/externalsort/testdata/sort/3/out.txt differ diff --git a/externalsort/testdata/sort/4/in1.txt b/externalsort/testdata/sort/4/in1.txt new file mode 100644 index 0000000..e69de29 diff --git a/externalsort/testdata/sort/4/in2.txt b/externalsort/testdata/sort/4/in2.txt new file mode 100644 index 0000000..e69de29 diff --git a/externalsort/testdata/sort/4/out.txt b/externalsort/testdata/sort/4/out.txt new file mode 100644 index 0000000..e69de29 
diff --git a/externalsort/testdata/sort/5/in1.txt b/externalsort/testdata/sort/5/in1.txt new file mode 100644 index 0000000..8baef1b Binary files /dev/null and b/externalsort/testdata/sort/5/in1.txt differ diff --git a/externalsort/testdata/sort/5/in2.txt b/externalsort/testdata/sort/5/in2.txt new file mode 100644 index 0000000..e69de29 diff --git a/externalsort/testdata/sort/5/in3.txt b/externalsort/testdata/sort/5/in3.txt new file mode 100644 index 0000000..8baef1b Binary files /dev/null and b/externalsort/testdata/sort/5/in3.txt differ diff --git a/externalsort/testdata/sort/5/out.txt b/externalsort/testdata/sort/5/out.txt new file mode 100644 index 0000000..5d8a556 Binary files /dev/null and b/externalsort/testdata/sort/5/out.txt differ diff --git a/externalsort/testdata/sort/6/in1.txt b/externalsort/testdata/sort/6/in1.txt new file mode 100644 index 0000000..88ae695 Binary files /dev/null and b/externalsort/testdata/sort/6/in1.txt differ diff --git a/externalsort/testdata/sort/6/in2.txt b/externalsort/testdata/sort/6/in2.txt new file mode 100644 index 0000000..99065bb Binary files /dev/null and b/externalsort/testdata/sort/6/in2.txt differ diff --git a/externalsort/testdata/sort/6/in3.txt b/externalsort/testdata/sort/6/in3.txt new file mode 100644 index 0000000..34aaebe Binary files /dev/null and b/externalsort/testdata/sort/6/in3.txt differ diff --git a/externalsort/testdata/sort/6/out.txt b/externalsort/testdata/sort/6/out.txt new file mode 100644 index 0000000..00ceb07 Binary files /dev/null and b/externalsort/testdata/sort/6/out.txt differ diff --git a/externalsort/testdata/sort/7/in1.txt b/externalsort/testdata/sort/7/in1.txt new file mode 100644 index 0000000..f29fb95 Binary files /dev/null and b/externalsort/testdata/sort/7/in1.txt differ diff --git a/externalsort/testdata/sort/7/in10.txt b/externalsort/testdata/sort/7/in10.txt new file mode 100644 index 0000000..bc342d6 Binary files /dev/null and b/externalsort/testdata/sort/7/in10.txt differ diff 
--git a/externalsort/testdata/sort/7/in2.txt b/externalsort/testdata/sort/7/in2.txt new file mode 100644 index 0000000..6f7a4c2 Binary files /dev/null and b/externalsort/testdata/sort/7/in2.txt differ diff --git a/externalsort/testdata/sort/7/in3.txt b/externalsort/testdata/sort/7/in3.txt new file mode 100644 index 0000000..056c1c2 Binary files /dev/null and b/externalsort/testdata/sort/7/in3.txt differ diff --git a/externalsort/testdata/sort/7/in4.txt b/externalsort/testdata/sort/7/in4.txt new file mode 100644 index 0000000..4509a9c Binary files /dev/null and b/externalsort/testdata/sort/7/in4.txt differ diff --git a/externalsort/testdata/sort/7/in5.txt b/externalsort/testdata/sort/7/in5.txt new file mode 100644 index 0000000..d10880f Binary files /dev/null and b/externalsort/testdata/sort/7/in5.txt differ diff --git a/externalsort/testdata/sort/7/in6.txt b/externalsort/testdata/sort/7/in6.txt new file mode 100644 index 0000000..c894bf6 Binary files /dev/null and b/externalsort/testdata/sort/7/in6.txt differ diff --git a/externalsort/testdata/sort/7/in7.txt b/externalsort/testdata/sort/7/in7.txt new file mode 100644 index 0000000..27c37b8 Binary files /dev/null and b/externalsort/testdata/sort/7/in7.txt differ diff --git a/externalsort/testdata/sort/7/in8.txt b/externalsort/testdata/sort/7/in8.txt new file mode 100644 index 0000000..6174927 Binary files /dev/null and b/externalsort/testdata/sort/7/in8.txt differ diff --git a/externalsort/testdata/sort/7/in9.txt b/externalsort/testdata/sort/7/in9.txt new file mode 100644 index 0000000..e69347f Binary files /dev/null and b/externalsort/testdata/sort/7/in9.txt differ diff --git a/externalsort/testdata/sort/7/out.txt b/externalsort/testdata/sort/7/out.txt new file mode 100644 index 0000000..03e6e78 Binary files /dev/null and b/externalsort/testdata/sort/7/out.txt differ