shad-go/externalsort/sort.go

144 lines
2.8 KiB
Go
Raw Permalink Normal View History

2022-02-10 22:06:57 +00:00
//go:build !solution
package externalsort
import (
2024-06-06 23:21:38 +00:00
"bufio"
"container/heap"
"io"
2024-06-06 23:21:38 +00:00
"os"
"sort"
"strings"
)
2024-06-06 23:21:38 +00:00
// lineReader reads newline-delimited lines through a buffered reader.
type lineReader struct {
	br bufio.Reader
}

// ReadLine returns the next line with its trailing '\n' removed.
//
// When the underlying read reports an error (io.EOF included), whatever was
// read before the error is returned unchanged together with that error, so a
// final line that is not newline-terminated arrives alongside io.EOF.
func (lr *lineReader) ReadLine() (string, error) {
	raw, err := lr.br.ReadString('\n')
	if err == nil {
		// ReadString succeeded, so raw ends with the delimiter; drop it.
		return raw[:len(raw)-1], nil
	}
	return raw, err
}
// lineWriter emits lines to an underlying io.Writer, one Write call per line.
type lineWriter struct {
	w io.Writer
}

// Write sends l followed by a single '\n' to the underlying writer in one
// call and returns that writer's error, if any.
func (lw *lineWriter) Write(l string) error {
	buf := make([]byte, 0, len(l)+1)
	buf = append(buf, l...)
	buf = append(buf, '\n')
	if _, err := lw.w.Write(buf); err != nil {
		return err
	}
	return nil
}
func NewReader(r io.Reader) LineReader {
2024-06-06 23:21:38 +00:00
return &lineReader{*bufio.NewReader(r)}
}
func NewWriter(w io.Writer) LineWriter {
2024-06-06 23:21:38 +00:00
return &lineWriter{w}
}
// readerHeapItem is one entry of readerHeap.
//
// It pairs the most recently read line of a reader with the reader it came
// from, so that after the entry is popped the next line can be fetched from
// the same source. lastLine marks an entry whose line was returned together
// with io.EOF: the reader has nothing more to give and is not read again.
type readerHeapItem struct {
// reader points at the LineReader that produced line.
reader *LineReader
// line is the key the heap is ordered by (see readerHeap.Less).
line string
// lastLine is true when reader reported io.EOF while producing line.
lastLine bool
}
// readerHeap is a min-heap of readerHeapItem ordered by line. It implements
// heap.Interface and is meant to be driven through container/heap.
type readerHeap []readerHeapItem

// Len reports the number of items currently stored.
func (h readerHeap) Len() int { return len(h) }

// Less orders items by ascending byte-wise comparison of their lines.
func (h readerHeap) Less(i, j int) bool {
	a, b := h[i].line, h[j].line
	return a < b
}

// Swap exchanges the items at positions i and j.
func (h readerHeap) Swap(i, j int) {
	tmp := h[i]
	h[i] = h[j]
	h[j] = tmp
}

// Push appends x, which must be a readerHeapItem, to the end of the slice.
// It panics if x has any other dynamic type.
func (h *readerHeap) Push(x any) {
	item := x.(readerHeapItem)
	*h = append(*h, item)
}

// Pop removes the last element of the slice and returns it.
func (h *readerHeap) Pop() any {
	last := len(*h) - 1
	item := (*h)[last]
	*h = (*h)[:last]
	return item
}
// pushIfNotLast reads the next line from reader and pushes it onto h.
//
// A line read without error is pushed with lastLine=false. A line that
// arrives together with io.EOF is pushed as well (even when it is empty),
// but with lastLine=true so the caller knows not to read from reader again.
// Any read error other than io.EOF is returned and nothing is pushed.
func pushIfNotLast(h *readerHeap, reader *LineReader) error {
	line, err := (*reader).ReadLine()
	atEOF := err == io.EOF
	if err != nil && !atEOF {
		return err
	}
	heap.Push(h, readerHeapItem{reader: reader, line: line, lastLine: atEOF})
	return nil
}
func Merge(w LineWriter, readers ...LineReader) error {
2024-06-06 23:21:38 +00:00
h := make(readerHeap, 0, len(readers))
heap.Init(&h)
// push initial values to heap
for _, reader := range readers {
err := pushIfNotLast(&h, &reader)
if err != nil {
return err
}
}
// write lines from the top of the heap while
// the heap is not empty
for len(h) > 0 {
it := heap.Pop(&h).(readerHeapItem)
w.Write(it.line)
if it.lastLine {
continue
}
err := pushIfNotLast(&h, it.reader)
if err != nil {
return err
}
}
return nil
}
func Sort(w io.Writer, in ...string) error {
2024-06-06 23:21:38 +00:00
readers := make([]LineReader, 0, len(in))
for _, fn := range in {
b, err := os.ReadFile(fn)
if err != nil {
return err
}
s := strings.TrimSuffix(string(b), "\n")
// do not sort empty files
if len(s) == 0 {
continue
}
lines := strings.Split(s, "\n")
sort.Strings(lines)
s = strings.Join(lines, "\n")
readers = append(readers, NewReader(strings.NewReader(s)))
}
return Merge(NewWriter(w), readers...)
}