From 2e6bf9cc2a8aa3bb9d213cbce32a05cce9c8a3c6 Mon Sep 17 00:00:00 2001 From: nate smith Date: Sun, 28 Apr 2024 01:15:10 -0700 Subject: [PATCH] new cutup approach(es) --- cutup/cutup.go | 376 ++++++++++++++++++++++++++++++++++------------- ingest/ingest.go | 99 ++++++++----- 2 files changed, 332 insertions(+), 143 deletions(-) diff --git a/cutup/cutup.go b/cutup/cutup.go index a1e568b..bfd7d39 100644 --- a/cutup/cutup.go +++ b/cutup/cutup.go @@ -2,32 +2,24 @@ package cutup import ( "bufio" + "crypto/sha1" "fmt" - "io" + "os" + "path" "strings" ) -func conjPrep(phraseBuff []byte, r rune) int { - if r != ' ' { - return -1 - } +const ( + srcDir = "/home/vilmibm/pg_plaintext/files" + tgtDir = "/home/vilmibm/pg_plaintext/cutup" + workers = 10 +) - suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"} - maxLen := 8 // TODO magic number based on longest suffix - offset := len(phraseBuff) - maxLen - if offset < 0 { - offset = 0 - } - end := string(phraseBuff[offset:]) - for _, s := range suffices { - if strings.HasSuffix(end, " "+s) { - return len(s) - } - } - return -1 -} +// TODO configurable src/tgt dir +// TODO generalize so it's not gutenberg specific -func Cutup(ins io.Reader) { +func worker(paths <-chan string, sources chan<- string) { + // TODO generalize to n character phrase markers, write new function phraseMarkers := map[rune]bool{ ';': true, ',': true, @@ -52,91 +44,221 @@ func Cutup(ins io.Reader) { '>': true, } - // I want to experiment with treating prepositions and conjunctions as phrase - // markers. 
+ for p := range paths { + f, err := os.Open(p) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to open '%s': %s\n", p, err.Error()) + } + s := bufio.NewScanner(f) - // to do this i would need to check the phraseBuff when I check phraseMarkers and then split accordingly + phraseBuff := []byte{} + written := 0 + inHeader := true + title := "" + sourceid := "" - s := bufio.NewScanner(ins) - phraseBuff := []byte{} - printed := false - for s.Scan() { - text := strings.TrimSpace(s.Text()) - for i, r := range text { - if ok := phraseMarkers[r]; ok { - if len(phraseBuff) >= 10 { - cleaned := clean(phraseBuff) - if len(cleaned) > 0 { - fmt.Println(cleaned) - printed = true + var of *os.File + var cleaned string + var ok bool + var asStr string + var text string + var prefix string + + for s.Scan() { + text = strings.TrimSpace(s.Text()) + if strings.HasPrefix(text, "*** START") { + title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG") + title, _ = strings.CutPrefix(title, " EBOOK") + title = strings.Map(rep, title) + title = strings.TrimSpace(title) + inHeader = false + continue + } + if inHeader { + continue + } + if strings.HasPrefix(text, "*** END") { + break + } + if title == "" { + fmt.Fprintf(os.Stderr, "got to cutup phase with no title: '%s'", p) + break + } + if sourceid == "" { + sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6] + prefix = sourceid + "\t" + of, err = os.Create(path.Join(tgtDir, sourceid)) + if err != nil { + fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error()) + break + } + } + for i, r := range text { + if ok = phraseMarkers[r]; ok { + if len(phraseBuff) >= 10 { + cleaned = clean(phraseBuff) + if len(cleaned) > 0 { + fmt.Fprintln(of, prefix+cleaned) + written++ + } } - } - if !printed { - //fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff)) - } - printed = false - phraseBuff = []byte{} - } else if v := conjPrep(phraseBuff, r); v > 0 { - // TODO erase or keep? starting with erase. 
- phraseBuff = phraseBuff[0 : len(phraseBuff)-v] - // TODO this pasta is copied - if len(phraseBuff) >= 10 { - cleaned := clean(phraseBuff) - if len(cleaned) > 0 { - fmt.Println(cleaned) - printed = true + phraseBuff = []byte{} + } else if v := conjPrep(phraseBuff, r); v > 0 { + // TODO erase or keep? starting with erase. + phraseBuff = phraseBuff[0 : len(phraseBuff)-v] + // TODO this pasta is copied + if len(phraseBuff) >= 10 { + cleaned = clean(phraseBuff) + if len(cleaned) > 0 { + fmt.Fprintln(of, prefix+cleaned) + written++ + } } + phraseBuff = []byte{} + } else { + asStr = string(phraseBuff) + if r == ' ' && strings.HasSuffix(asStr, " ") { + continue + } + if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' { + phraseBuff = append(phraseBuff, byte(' ')) + } + phraseBuff = append(phraseBuff, byte(r)) } - if !printed { - //fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff)) - } - printed = false - phraseBuff = []byte{} - } else { - asStr := string(phraseBuff) - if r == ' ' && strings.HasSuffix(asStr, " ") { - continue - } - if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' { - phraseBuff = append(phraseBuff, byte(' ')) - } - phraseBuff = append(phraseBuff, byte(r)) } } + of.Close() + if written == 0 { + // there are a bunch of empty books in gutenberg :( these are text files + // that just have start and end markers with nothing in between. nothing + // i can do about it. 
+ fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p) + } + sources <- fmt.Sprintf("%s\t%s", sourceid, title) } } -func isAlpha(r rune) bool { - alphaChars := map[rune]bool{ - 'a': true, - 'b': true, - 'c': true, - 'd': true, - 'e': true, - 'f': true, - 'g': true, - 'h': true, - 'i': true, - 'j': true, - 'k': true, - 'l': true, - 'm': true, - 'n': true, - 'o': true, - 'p': true, - 'q': true, - 'r': true, - 's': true, - 't': true, - 'u': true, - 'v': true, - 'w': true, - 'x': true, - 'y': true, - 'z': true, +func CutupFiles() error { + err := os.Mkdir(tgtDir, 0770) + if err != nil { + return err } - lookup := strings.ToLower(string(r)) - return alphaChars[rune(lookup[0])] + + dir, err := os.Open(srcDir) + if err != nil { + return fmt.Errorf("could not open %s: %w", srcDir, err) + } + entries, err := dir.Readdirnames(-1) + if err != nil { + return fmt.Errorf("could not read %s: %w", srcDir, err) + } + + paths := make(chan string, len(entries)) + sources := make(chan string, len(entries)) + + for x := 0; x < workers; x++ { + go worker(paths, sources) + } + + for _, e := range entries { + paths <- path.Join(srcDir, e) + } + close(paths) + + ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv")) + if err != nil { + return fmt.Errorf("could not open index file: %w", err) + } + defer ixFile.Close() + + for i := 0; i < len(entries); i++ { + l := <-sources + fmt.Printf("%d/%d\r", i+1, len(entries)) + fmt.Fprintln(ixFile, l) + } + close(sources) + + return nil +} + +func conjPrep(phraseBuff []byte, r rune) int { + if r != ' ' { + return -1 + } + + suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"} + maxLen := 8 // TODO magic number based on longest suffix + offset := len(phraseBuff) - maxLen + if offset < 0 { + offset = 0 + } + end := string(phraseBuff[offset:]) + for _, s := range suffices { + if strings.HasSuffix(end, " "+s) { + return len(s) + } + } + return -1 +} + +func isAlpha(r rune) bool { 
// isAlpha reports whether r is an ASCII letter (a-z, A-Z), implemented
// with rune ranges per the old TODO instead of a 26-case switch that
// allocated a string per rune.
// NOTE(review): the old strings.ToLower-based switch also matched a few
// exotic runes that lowercase into ASCII (e.g. U+212A KELVIN SIGN); this
// version deliberately does not.
func isAlpha(r rune) bool {
	return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
}

// alphaPercent returns the percentage (0-100) of runes in s for which
// isAlpha is true.
// FIX: guards the empty string, which previously fed 0/0 into the float
// division and produced NaN.
// NOTE(review): the counting loop was elided diff context in this patch —
// confirm this reconstruction matches the previous revision.
func alphaPercent(s string) float64 {
	if s == "" {
		return 0
	}
	var alpha, total float64
	for _, r := range s {
		total++
		if isAlpha(r) {
			alpha++
		}
	}
	return 100 * (alpha / total)
}

// rep is a strings.Map transform used on titles and phrases: it normalizes
// curly quotes, deletes markup and stray control characters (returning -1
// deletes the rune), and flips backslashes to forward slashes. Deleting
// '\t' and '\n' keeps the tab-separated phrase files well-formed.
func rep(r rune) rune {
	switch r {
	case '’':
		return '\''
	case '“', '”':
		return '"'
	case '\\':
		return '/'
	case '"', '(', '[', '{', '<', '_', '*',
		'\r', '\t',
		'\n', // should not need this but stray \n ending up in output...
		0x00, 0x01, 0x0f, 0x19, 0x1b, 0x1c, 0xb0:
		return -1
	}
	return r
}

// clean normalizes a raw phrase: strips invalid UTF-8, applies rep, trims
// surrounding whitespace and quotes, and lowercases. Phrases that are less
// than 50% letters are rejected (returns "").
// FIX: whitespace is trimmed before the quote trim — previously a quote
// preceded by a space was never removed.
func clean(bs []byte) string {
	s := strings.Map(rep, strings.ToValidUTF8(string(bs), ""))
	s = strings.TrimSpace(s)
	s = strings.Trim(s, "'\"")
	s = strings.ToLower(strings.TrimSpace(s))
	if alphaPercent(s) < 50.0 {
		return ""
	}
	return s
}
ON CONFLICT DO NOTHING RETURNING id") - if err != nil { - return -1, err - } +// TODO +// - [X] finalize gutenberg ingestion +// - [ ] clean up commands +// - [ ] clean up repo +// - [ ] push and deploy to town with new pg db +// - [ ] gamefaqs extraction +// - [ ] corpus selector +// - [ ] deploy to town +// - [ ] geocities +// - [ ] blog post +// - [ ] launch - result, err := stmt.Exec(sourceName) - if err != nil { - return -1, err - } - - defer stmt.Close() - - return result.LastInsertId() -} - -func Ingest(sourceName string, ins io.Reader) error { - db, err := sql.Open("sqlite3", dsn) +func IngestGut() error { + conn, err := db.Connect() if err != nil { return err } + defer conn.Close(context.Background()) - defer db.Close() - - s := bufio.NewScanner(ins) - - sourceID, err := createSource(db, sourceName) + dir, err := os.Open(cutupDir) if err != nil { - return fmt.Errorf("could not make source: %w", err) + return fmt.Errorf("could not open %s: %w", cutupDir, err) } - tx, err := db.Begin() + // echo gutenberg | sha1sum | head -c7 + corpusid := "cb20c3e" + _, err = conn.Exec(context.Background(), "INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING", corpusid, "gutenberg") if err != nil { - return fmt.Errorf("failed to create transaction: %w", err) + return fmt.Errorf("failed to create gutenberg corpus: %w", err) } - stmt, err := tx.Prepare("INSERT INTO phrases (sourceid, phrase) VALUES (?, ?) 
ON CONFLICT DO NOTHING") - defer stmt.Close() + entries, err := dir.Readdirnames(-1) + if err != nil { + return fmt.Errorf("could not read %s: %w", cutupDir, err) + } + + idx, err := os.Open(path.Join(cutupDir, "_title_index.tsv")) + if err != nil { + return fmt.Errorf("failed to open source index: %w", err) + } + + tx, err := conn.Begin(context.Background()) + if err != nil { + return fmt.Errorf("could not open transaction: %w", err) + } + + s := bufio.NewScanner(idx) for s.Scan() { - phrase := s.Text() - if err != nil { - return err + line := s.Text() + parts := strings.SplitN(line, " ", 2) + if len(parts) != 2 { + return fmt.Errorf("malformed line in sourceMap: %s", line) } + _, err = tx.Exec(context.Background(), + "INSERT INTO sources (id, corpusid, name) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING", + parts[0], corpusid, parts[1]) + } - if _, err = stmt.Exec(sourceID, phrase); err != nil { - return fmt.Errorf("could not insert phrase '%s' for source '%d': %w", phrase, sourceID, err) + tx.Commit(context.Background()) + + for _, e := range entries { + if strings.HasPrefix(e, "_") { + continue + } + p := path.Join(cutupDir, e) + sql := fmt.Sprintf("COPY phrases(sourceid, phrase) FROM '%s'", p) + _, err = conn.Exec(context.Background(), sql) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to ingest '%s': %s\n", p, err.Error()) } } - return tx.Commit() + return nil }