new cutup approach(es)

This commit is contained in:
nate smith 2024-04-28 01:15:10 -07:00
parent 59e4e54172
commit 2e6bf9cc2a
2 changed files with 332 additions and 143 deletions

View File

@ -2,32 +2,24 @@ package cutup
import (
"bufio"
"crypto/sha1"
"fmt"
"io"
"os"
"path"
"strings"
)
func conjPrep(phraseBuff []byte, r rune) int {
if r != ' ' {
return -1
}
const (
srcDir = "/home/vilmibm/pg_plaintext/files"
tgtDir = "/home/vilmibm/pg_plaintext/cutup"
workers = 10
)
suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
maxLen := 8 // TODO magic number based on longest suffix
offset := len(phraseBuff) - maxLen
if offset < 0 {
offset = 0
}
end := string(phraseBuff[offset:])
for _, s := range suffices {
if strings.HasSuffix(end, " "+s) {
return len(s)
}
}
return -1
}
// TODO configurable src/tgt dir
// TODO generalize so it's not gutenberg specific
func Cutup(ins io.Reader) {
func worker(paths <-chan string, sources chan<- string) {
// TODO generalize to n character phrase markers, write new function
phraseMarkers := map[rune]bool{
';': true,
',': true,
@ -52,91 +44,221 @@ func Cutup(ins io.Reader) {
'>': true,
}
// I want to experiment with treating prepositions and conjunctions as phrase
// markers.
for p := range paths {
f, err := os.Open(p)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to open '%s': %s\n", p, err.Error())
}
s := bufio.NewScanner(f)
// to do this i would need to check the phraseBuff when I check phraseMarkers and then split accordingly
phraseBuff := []byte{}
written := 0
inHeader := true
title := ""
sourceid := ""
s := bufio.NewScanner(ins)
phraseBuff := []byte{}
printed := false
for s.Scan() {
text := strings.TrimSpace(s.Text())
for i, r := range text {
if ok := phraseMarkers[r]; ok {
if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
var of *os.File
var cleaned string
var ok bool
var asStr string
var text string
var prefix string
for s.Scan() {
text = strings.TrimSpace(s.Text())
if strings.HasPrefix(text, "*** START") {
title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG")
title, _ = strings.CutPrefix(title, " EBOOK")
title = strings.Map(rep, title)
title = strings.TrimSpace(title)
inHeader = false
continue
}
if inHeader {
continue
}
if strings.HasPrefix(text, "*** END") {
break
}
if title == "" {
fmt.Fprintf(os.Stderr, "got to cutup phase with no title: '%s'", p)
break
}
if sourceid == "" {
sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6]
prefix = sourceid + "\t"
of, err = os.Create(path.Join(tgtDir, sourceid))
if err != nil {
fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error())
break
}
}
for i, r := range text {
if ok = phraseMarkers[r]; ok {
if len(phraseBuff) >= 10 {
cleaned = clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Fprintln(of, prefix+cleaned)
written++
}
}
}
if !printed {
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
phraseBuff = []byte{}
} else if v := conjPrep(phraseBuff, r); v > 0 {
// TODO erase or keep? starting with erase.
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
// TODO this pasta is copied
if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
phraseBuff = []byte{}
} else if v := conjPrep(phraseBuff, r); v > 0 {
// TODO erase or keep? starting with erase.
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
// TODO this pasta is copied
if len(phraseBuff) >= 10 {
cleaned = clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Fprintln(of, prefix+cleaned)
written++
}
}
phraseBuff = []byte{}
} else {
asStr = string(phraseBuff)
if r == ' ' && strings.HasSuffix(asStr, " ") {
continue
}
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
phraseBuff = append(phraseBuff, byte(' '))
}
phraseBuff = append(phraseBuff, byte(r))
}
if !printed {
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
phraseBuff = []byte{}
} else {
asStr := string(phraseBuff)
if r == ' ' && strings.HasSuffix(asStr, " ") {
continue
}
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
phraseBuff = append(phraseBuff, byte(' '))
}
phraseBuff = append(phraseBuff, byte(r))
}
}
of.Close()
if written == 0 {
// there are a bunch of empty books in gutenberg :( these are text files
// that just have start and end markers with nothing in between. nothing
// i can do about it.
fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
}
sources <- fmt.Sprintf("%s\t%s", sourceid, title)
}
}
func isAlpha(r rune) bool {
alphaChars := map[rune]bool{
'a': true,
'b': true,
'c': true,
'd': true,
'e': true,
'f': true,
'g': true,
'h': true,
'i': true,
'j': true,
'k': true,
'l': true,
'm': true,
'n': true,
'o': true,
'p': true,
'q': true,
'r': true,
's': true,
't': true,
'u': true,
'v': true,
'w': true,
'x': true,
'y': true,
'z': true,
// CutupFiles cuts every Gutenberg plaintext file under srcDir into
// phrases, writing one phrase file per book into tgtDir plus a
// "_title_index.tsv" mapping source ids to titles. The work is fanned
// out across `workers` goroutines; each worker sends exactly one index
// line per input path, which is how this function knows when all files
// have been processed.
//
// NOTE(review): two stray lines referencing `alphaChars` from a removed
// map-based isAlpha were interleaved into this body by the diff
// rendering; they have been dropped.
func CutupFiles() error {
	if err := os.Mkdir(tgtDir, 0770); err != nil {
		// NOTE(review): this also errors when tgtDir already exists —
		// confirm that refusing to re-run over an existing dir is intended.
		return err
	}
	dir, err := os.Open(srcDir)
	if err != nil {
		return fmt.Errorf("could not open %s: %w", srcDir, err)
	}
	entries, err := dir.Readdirnames(-1)
	if err != nil {
		return fmt.Errorf("could not read %s: %w", srcDir, err)
	}
	// Buffer both channels to the full entry count so neither the path
	// producer below nor the workers can block on channel capacity.
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))
	for x := 0; x < workers; x++ {
		go worker(paths, sources)
	}
	for _, e := range entries {
		paths <- path.Join(srcDir, e)
	}
	close(paths)
	ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv"))
	if err != nil {
		return fmt.Errorf("could not open index file: %w", err)
	}
	defer ixFile.Close()
	// One received line == one finished input file, so this loop doubles
	// as the wait for all workers to drain the paths channel.
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		fmt.Fprintln(ixFile, l)
	}
	close(sources)
	return nil
}
// conjPrep checks whether phraseBuff ends in a conjunction or
// preposition that should act as a phrase boundary. It is only
// consulted when the rune being processed, r, is a space; any other
// rune returns -1 immediately.
//
// It returns the byte length of the matched trailing word (so the
// caller can trim it off), or -1 when the buffer does not end in
// " " + one of the known words.
func conjPrep(phraseBuff []byte, r rune) int {
	if r != ' ' {
		return -1
	}
	suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
	// A match is " " + word, so only the last (longest word + 1) bytes
	// of the buffer can matter. Derive that window from the list itself
	// instead of the old hard-coded 8 (which was len("however")+1).
	maxLen := 0
	for _, s := range suffices {
		if len(s)+1 > maxLen {
			maxLen = len(s) + 1
		}
	}
	offset := len(phraseBuff) - maxLen
	if offset < 0 {
		offset = 0
	}
	end := string(phraseBuff[offset:])
	for _, s := range suffices {
		if strings.HasSuffix(end, " "+s) {
			return len(s)
		}
	}
	return -1
}
// isAlpha reports whether r lower-cases to a single ASCII letter a-z.
//
// Case folding goes through strings.ToLower, exactly as the original
// 26-way switch did, so runes whose lower-case form is one ASCII byte
// (e.g. the Kelvin sign U+212A, which lowers to "k") still count, while
// runes whose lower-case form is longer than one byte (accented or
// non-Latin letters) do not. This resolves the old TODO by replacing
// the per-letter cases with a numeric range check.
func isAlpha(r rune) bool {
	lower := strings.ToLower(string(r))
	return len(lower) == 1 && lower[0] >= 'a' && lower[0] <= 'z'
}
func alphaPercent(s string) float64 {
@ -153,19 +275,61 @@ func alphaPercent(s string) float64 {
return 100 * (alpha / total)
}
// rep is a strings.Map transform that normalizes raw Gutenberg text:
// typographic quotes become ASCII quotes, backslash becomes forward
// slash, and bracket/markup characters plus assorted stray control
// bytes are deleted (returning a negative rune drops it from the mapped
// string). Every other rune passes through unchanged.
func rep(r rune) (s rune) {
	s = r
	switch s {
	// NOTE(review): this rune literal was lost to a bad encoding in the
	// source ("case '':"); U+2019 (right single quote) is inferred from
	// the matching curly-double-quote cases below and the parallel
	// garbled ReplaceAll in clean().
	case '’':
		return '\''
	case '“', '”':
		return '"'
	case '"', '(', '[', '{', '<', '_', '*':
		// Quote/bracket/markup characters: dropped entirely.
		return -1
	case '\r', '\t':
		return -1
	case '\n': // should not need this but stray \n ending up in output...
		return -1
	case 0x1c, 0x19, 0x01, 0x0f, 0x00, 0xb0, 0x1b:
		// Control bytes (plus 0xb0, the degree sign) observed in the
		// wild in Gutenberg files: dropped.
		return -1
	case '\\':
		return '/'
	}
	return
}
func clean(bs []byte) string {
s := string(bs)
s = strings.ReplaceAll(s, "", "'")
s = strings.ReplaceAll(s, "\"", "")
s = strings.ReplaceAll(s, "(", "")
s = strings.ReplaceAll(s, "[", "")
s = strings.ReplaceAll(s, "{", "")
s = strings.ReplaceAll(s, "<", "")
s = strings.ReplaceAll(s, "_", "")
s = strings.ReplaceAll(s, "*", "")
s = strings.TrimLeft(s, "'\"")
s = strings.TrimSpace(s)
s = strings.ToLower(s)
s := strings.ToLower(
strings.TrimSpace(
strings.TrimRight(
strings.TrimLeft(
strings.Map(rep, strings.ToValidUTF8(string(bs), "")), "'\""), "'\"")))
if alphaPercent(s) < 50.0 {
return ""

View File

@ -2,63 +2,88 @@ package ingest
import (
"bufio"
"database/sql"
"context"
"fmt"
"io"
"os"
"path"
"strings"
_ "github.com/mattn/go-sqlite3"
"github.com/vilmibm/trunkless/db"
)
const dsn = "phrase.db?_journal=OFF"
const cutupDir = "/home/vilmibm/pg_plaintext/cutup"
// TODO
// - [X] finalize gutenberg ingestion
// - [ ] clean up commands
// - [ ] clean up repo
// - [ ] push and deploy to town with new pg db
// - [ ] gamefaqs extraction
// - [ ] corpus selector
// - [ ] deploy to town
// - [ ] geocities
// - [ ] blog post
// - [ ] launch

// createSource inserts a row into the sources table for sourceName and
// returns the driver's LastInsertId for it. On a name conflict the
// insert is a no-op.
//
// NOTE(review): the RETURNING clause is never read — Exec discards any
// result rows, and after a conflict LastInsertId will not be the id of
// the pre-existing row. If that id is needed, use QueryRow + Scan.
// Left as-is here to keep behavior unchanged.
func createSource(db *sql.DB, sourceName string) (int64, error) {
	stmt, err := db.Prepare("INSERT INTO sources (name) VALUES (?) ON CONFLICT DO NOTHING RETURNING id")
	if err != nil {
		return -1, err
	}
	// Close the statement on every exit path: the original deferred the
	// Close only after a successful Exec, leaking stmt when Exec failed.
	defer stmt.Close()
	result, err := stmt.Exec(sourceName)
	if err != nil {
		return -1, err
	}
	return result.LastInsertId()
}
func Ingest(sourceName string, ins io.Reader) error {
db, err := sql.Open("sqlite3", dsn)
func IngestGut() error {
conn, err := db.Connect()
if err != nil {
return err
}
defer conn.Close(context.Background())
defer db.Close()
s := bufio.NewScanner(ins)
sourceID, err := createSource(db, sourceName)
dir, err := os.Open(cutupDir)
if err != nil {
return fmt.Errorf("could not make source: %w", err)
return fmt.Errorf("could not open %s: %w", cutupDir, err)
}
tx, err := db.Begin()
// echo gutenberg | sha1sum | head -c7
corpusid := "cb20c3e"
_, err = conn.Exec(context.Background(), "INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING", corpusid, "gutenberg")
if err != nil {
return fmt.Errorf("failed to create transaction: %w", err)
return fmt.Errorf("failed to create gutenberg corpus: %w", err)
}
stmt, err := tx.Prepare("INSERT INTO phrases (sourceid, phrase) VALUES (?, ?) ON CONFLICT DO NOTHING")
defer stmt.Close()
entries, err := dir.Readdirnames(-1)
if err != nil {
return fmt.Errorf("could not read %s: %w", cutupDir, err)
}
idx, err := os.Open(path.Join(cutupDir, "_title_index.tsv"))
if err != nil {
return fmt.Errorf("failed to open source index: %w", err)
}
tx, err := conn.Begin(context.Background())
if err != nil {
return fmt.Errorf("could not open transaction: %w", err)
}
s := bufio.NewScanner(idx)
for s.Scan() {
phrase := s.Text()
if err != nil {
return err
line := s.Text()
parts := strings.SplitN(line, " ", 2)
if len(parts) != 2 {
return fmt.Errorf("malformed line in sourceMap: %s", line)
}
_, err = tx.Exec(context.Background(),
"INSERT INTO sources (id, corpusid, name) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
parts[0], corpusid, parts[1])
}
if _, err = stmt.Exec(sourceID, phrase); err != nil {
return fmt.Errorf("could not insert phrase '%s' for source '%d': %w", phrase, sourceID, err)
tx.Commit(context.Background())
for _, e := range entries {
if strings.HasPrefix(e, "_") {
continue
}
p := path.Join(cutupDir, e)
sql := fmt.Sprintf("COPY phrases(sourceid, phrase) FROM '%s'", p)
_, err = conn.Exec(context.Background(), sql)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to ingest '%s': %s\n", p, err.Error())
}
}
return tx.Commit()
return nil
}