new cutup approach(es)
This commit is contained in:
parent
59e4e54172
commit
2e6bf9cc2a
376
cutup/cutup.go
376
cutup/cutup.go
@ -2,32 +2,24 @@ package cutup
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"crypto/sha1"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func conjPrep(phraseBuff []byte, r rune) int {
|
||||
if r != ' ' {
|
||||
return -1
|
||||
}
|
||||
const (
|
||||
srcDir = "/home/vilmibm/pg_plaintext/files"
|
||||
tgtDir = "/home/vilmibm/pg_plaintext/cutup"
|
||||
workers = 10
|
||||
)
|
||||
|
||||
suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
|
||||
maxLen := 8 // TODO magic number based on longest suffix
|
||||
offset := len(phraseBuff) - maxLen
|
||||
if offset < 0 {
|
||||
offset = 0
|
||||
}
|
||||
end := string(phraseBuff[offset:])
|
||||
for _, s := range suffices {
|
||||
if strings.HasSuffix(end, " "+s) {
|
||||
return len(s)
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
// TODO configurable src/tgt dir
|
||||
// TODO generalize so it's not gutenberg specific
|
||||
|
||||
func Cutup(ins io.Reader) {
|
||||
func worker(paths <-chan string, sources chan<- string) {
|
||||
// TODO generalize to n character phrase markers, write new function
|
||||
phraseMarkers := map[rune]bool{
|
||||
';': true,
|
||||
',': true,
|
||||
@ -52,91 +44,221 @@ func Cutup(ins io.Reader) {
|
||||
'>': true,
|
||||
}
|
||||
|
||||
// I want to experiment with treating prepositions and conjunctions as phrase
|
||||
// markers.
|
||||
for p := range paths {
|
||||
f, err := os.Open(p)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "failed to open '%s': %s\n", p, err.Error())
|
||||
}
|
||||
s := bufio.NewScanner(f)
|
||||
|
||||
// to do this i would need to check the phraseBuff when I check phraseMarkers and then split accordingly
|
||||
phraseBuff := []byte{}
|
||||
written := 0
|
||||
inHeader := true
|
||||
title := ""
|
||||
sourceid := ""
|
||||
|
||||
s := bufio.NewScanner(ins)
|
||||
phraseBuff := []byte{}
|
||||
printed := false
|
||||
for s.Scan() {
|
||||
text := strings.TrimSpace(s.Text())
|
||||
for i, r := range text {
|
||||
if ok := phraseMarkers[r]; ok {
|
||||
if len(phraseBuff) >= 10 {
|
||||
cleaned := clean(phraseBuff)
|
||||
if len(cleaned) > 0 {
|
||||
fmt.Println(cleaned)
|
||||
printed = true
|
||||
var of *os.File
|
||||
var cleaned string
|
||||
var ok bool
|
||||
var asStr string
|
||||
var text string
|
||||
var prefix string
|
||||
|
||||
for s.Scan() {
|
||||
text = strings.TrimSpace(s.Text())
|
||||
if strings.HasPrefix(text, "*** START") {
|
||||
title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG")
|
||||
title, _ = strings.CutPrefix(title, " EBOOK")
|
||||
title = strings.Map(rep, title)
|
||||
title = strings.TrimSpace(title)
|
||||
inHeader = false
|
||||
continue
|
||||
}
|
||||
if inHeader {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(text, "*** END") {
|
||||
break
|
||||
}
|
||||
if title == "" {
|
||||
fmt.Fprintf(os.Stderr, "got to cutup phase with no title: '%s'", p)
|
||||
break
|
||||
}
|
||||
if sourceid == "" {
|
||||
sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6]
|
||||
prefix = sourceid + "\t"
|
||||
of, err = os.Create(path.Join(tgtDir, sourceid))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error())
|
||||
break
|
||||
}
|
||||
}
|
||||
for i, r := range text {
|
||||
if ok = phraseMarkers[r]; ok {
|
||||
if len(phraseBuff) >= 10 {
|
||||
cleaned = clean(phraseBuff)
|
||||
if len(cleaned) > 0 {
|
||||
fmt.Fprintln(of, prefix+cleaned)
|
||||
written++
|
||||
}
|
||||
}
|
||||
}
|
||||
if !printed {
|
||||
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
|
||||
}
|
||||
printed = false
|
||||
phraseBuff = []byte{}
|
||||
} else if v := conjPrep(phraseBuff, r); v > 0 {
|
||||
// TODO erase or keep? starting with erase.
|
||||
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
|
||||
// TODO this pasta is copied
|
||||
if len(phraseBuff) >= 10 {
|
||||
cleaned := clean(phraseBuff)
|
||||
if len(cleaned) > 0 {
|
||||
fmt.Println(cleaned)
|
||||
printed = true
|
||||
phraseBuff = []byte{}
|
||||
} else if v := conjPrep(phraseBuff, r); v > 0 {
|
||||
// TODO erase or keep? starting with erase.
|
||||
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
|
||||
// TODO this pasta is copied
|
||||
if len(phraseBuff) >= 10 {
|
||||
cleaned = clean(phraseBuff)
|
||||
if len(cleaned) > 0 {
|
||||
fmt.Fprintln(of, prefix+cleaned)
|
||||
written++
|
||||
}
|
||||
}
|
||||
phraseBuff = []byte{}
|
||||
} else {
|
||||
asStr = string(phraseBuff)
|
||||
if r == ' ' && strings.HasSuffix(asStr, " ") {
|
||||
continue
|
||||
}
|
||||
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
|
||||
phraseBuff = append(phraseBuff, byte(' '))
|
||||
}
|
||||
phraseBuff = append(phraseBuff, byte(r))
|
||||
}
|
||||
if !printed {
|
||||
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
|
||||
}
|
||||
printed = false
|
||||
phraseBuff = []byte{}
|
||||
} else {
|
||||
asStr := string(phraseBuff)
|
||||
if r == ' ' && strings.HasSuffix(asStr, " ") {
|
||||
continue
|
||||
}
|
||||
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
|
||||
phraseBuff = append(phraseBuff, byte(' '))
|
||||
}
|
||||
phraseBuff = append(phraseBuff, byte(r))
|
||||
}
|
||||
}
|
||||
of.Close()
|
||||
if written == 0 {
|
||||
// there are a bunch of empty books in gutenberg :( these are text files
|
||||
// that just have start and end markers with nothing in between. nothing
|
||||
// i can do about it.
|
||||
fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
|
||||
}
|
||||
sources <- fmt.Sprintf("%s\t%s", sourceid, title)
|
||||
}
|
||||
}
|
||||
|
||||
func isAlpha(r rune) bool {
|
||||
alphaChars := map[rune]bool{
|
||||
'a': true,
|
||||
'b': true,
|
||||
'c': true,
|
||||
'd': true,
|
||||
'e': true,
|
||||
'f': true,
|
||||
'g': true,
|
||||
'h': true,
|
||||
'i': true,
|
||||
'j': true,
|
||||
'k': true,
|
||||
'l': true,
|
||||
'm': true,
|
||||
'n': true,
|
||||
'o': true,
|
||||
'p': true,
|
||||
'q': true,
|
||||
'r': true,
|
||||
's': true,
|
||||
't': true,
|
||||
'u': true,
|
||||
'v': true,
|
||||
'w': true,
|
||||
'x': true,
|
||||
'y': true,
|
||||
'z': true,
|
||||
func CutupFiles() error {
|
||||
err := os.Mkdir(tgtDir, 0770)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
lookup := strings.ToLower(string(r))
|
||||
return alphaChars[rune(lookup[0])]
|
||||
|
||||
dir, err := os.Open(srcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open %s: %w", srcDir, err)
|
||||
}
|
||||
entries, err := dir.Readdirnames(-1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not read %s: %w", srcDir, err)
|
||||
}
|
||||
|
||||
paths := make(chan string, len(entries))
|
||||
sources := make(chan string, len(entries))
|
||||
|
||||
for x := 0; x < workers; x++ {
|
||||
go worker(paths, sources)
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
paths <- path.Join(srcDir, e)
|
||||
}
|
||||
close(paths)
|
||||
|
||||
ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv"))
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open index file: %w", err)
|
||||
}
|
||||
defer ixFile.Close()
|
||||
|
||||
for i := 0; i < len(entries); i++ {
|
||||
l := <-sources
|
||||
fmt.Printf("%d/%d\r", i+1, len(entries))
|
||||
fmt.Fprintln(ixFile, l)
|
||||
}
|
||||
close(sources)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// conjPrep reports whether phraseBuff currently ends in a conjunction or
// preposition that should be treated as a phrase boundary.
//
// r is the rune about to be appended to phraseBuff. A boundary is only
// possible when r is a space, i.e. when the trailing word has just been
// completed. The trailing word must itself be preceded by a space, so a
// buffer that consists solely of the word (or ends in a longer word such as
// "into") does not match.
//
// It returns the byte length of the matched word (without its leading space)
// so the caller can trim it off the buffer, or -1 when there is no match.
func conjPrep(phraseBuff []byte, r rune) int {
	if r != ' ' {
		return -1
	}

	// Each word carries its own length, so the comparison window never
	// depends on a hand-maintained "longest word" constant.
	words := [...]string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}

	for _, w := range words {
		// Need room for the word plus the space that precedes it.
		if len(phraseBuff) <= len(w) {
			continue
		}
		tail := phraseBuff[len(phraseBuff)-len(w)-1:]
		// string(b) == s is compiled without copying b.
		if tail[0] == ' ' && string(tail[1:]) == w {
			return len(w)
		}
	}

	return -1
}
|
||||
|
||||
// isAlpha reports whether r is a letter of the basic Latin alphabet,
// ignoring case.
//
// The check is done with rune range comparisons rather than by lowercasing a
// one-rune string, so it does no allocation per call.
func isAlpha(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z':
		return true
	case r == '\u0130', r == '\u212A':
		// LATIN CAPITAL LETTER I WITH DOT ABOVE and KELVIN SIGN are the only
		// non-ASCII runes whose Unicode lowercase mapping is an ASCII letter
		// ('i' and 'k'). The earlier strings.ToLower-based check accepted
		// them, so they are kept to leave results unchanged.
		return true
	}
	return false
}
|
||||
|
||||
func alphaPercent(s string) float64 {
|
||||
@ -153,19 +275,61 @@ func alphaPercent(s string) float64 {
|
||||
return 100 * (alpha / total)
|
||||
}
|
||||
|
||||
// rep is a rune-mapping function for strings.Map. It normalises typographic
// quotes to their ASCII forms, turns a backslash into a forward slash, and
// drops (by returning -1) a fixed set of punctuation, whitespace and control
// runes. Every other rune is returned unchanged.
func rep(r rune) (s rune) {
	switch r {
	// Curly quotes become plain ASCII quotes.
	case '’':
		s = '\''
	case '“', '”':
		s = '"'

	case '\\':
		s = '/'

	// Runes removed from the output entirely.
	case '"', '(', '[', '{', '<', '_', '*',
		'\r', '\t',
		'\n', // should not be needed, but stray \n was ending up in output
		0x00, 0x01, 0x0f, 0x19, 0x1b, 0x1c, 0xb0:
		s = -1

	default:
		s = r
	}
	return s
}
|
||||
|
||||
func clean(bs []byte) string {
|
||||
s := string(bs)
|
||||
s = strings.ReplaceAll(s, "’", "'")
|
||||
s = strings.ReplaceAll(s, "\"", "")
|
||||
s = strings.ReplaceAll(s, "(", "")
|
||||
s = strings.ReplaceAll(s, "[", "")
|
||||
s = strings.ReplaceAll(s, "{", "")
|
||||
s = strings.ReplaceAll(s, "<", "")
|
||||
s = strings.ReplaceAll(s, "_", "")
|
||||
s = strings.ReplaceAll(s, "*", "")
|
||||
s = strings.TrimLeft(s, "'\"")
|
||||
s = strings.TrimSpace(s)
|
||||
s = strings.ToLower(s)
|
||||
s := strings.ToLower(
|
||||
strings.TrimSpace(
|
||||
strings.TrimRight(
|
||||
strings.TrimLeft(
|
||||
strings.Map(rep, strings.ToValidUTF8(string(bs), "")), "'\""), "'\"")))
|
||||
|
||||
if alphaPercent(s) < 50.0 {
|
||||
return ""
|
||||
|
@ -2,63 +2,88 @@ package ingest
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"database/sql"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
"github.com/vilmibm/trunkless/db"
|
||||
)
|
||||
|
||||
const dsn = "phrase.db?_journal=OFF"
|
||||
const cutupDir = "/home/vilmibm/pg_plaintext/cutup"
|
||||
|
||||
func createSource(db *sql.DB, sourceName string) (int64, error) {
|
||||
stmt, err := db.Prepare("INSERT INTO sources (name) VALUES (?) ON CONFLICT DO NOTHING RETURNING id")
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
// TODO
|
||||
// - [X] finalize gutenberg ingestion
|
||||
// - [ ] clean up commands
|
||||
// - [ ] clean up repo
|
||||
// - [ ] push and deploy to town with new pg db
|
||||
// - [ ] gamefaqs extraction
|
||||
// - [ ] corpus selector
|
||||
// - [ ] deploy to town
|
||||
// - [ ] geocities
|
||||
// - [ ] blog post
|
||||
// - [ ] launch
|
||||
|
||||
result, err := stmt.Exec(sourceName)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
defer stmt.Close()
|
||||
|
||||
return result.LastInsertId()
|
||||
}
|
||||
|
||||
func Ingest(sourceName string, ins io.Reader) error {
|
||||
db, err := sql.Open("sqlite3", dsn)
|
||||
func IngestGut() error {
|
||||
conn, err := db.Connect()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer conn.Close(context.Background())
|
||||
|
||||
defer db.Close()
|
||||
|
||||
s := bufio.NewScanner(ins)
|
||||
|
||||
sourceID, err := createSource(db, sourceName)
|
||||
dir, err := os.Open(cutupDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not make source: %w", err)
|
||||
return fmt.Errorf("could not open %s: %w", cutupDir, err)
|
||||
}
|
||||
|
||||
tx, err := db.Begin()
|
||||
// echo gutenberg | sha1sum | head -c7
|
||||
corpusid := "cb20c3e"
|
||||
_, err = conn.Exec(context.Background(), "INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING", corpusid, "gutenberg")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create transaction: %w", err)
|
||||
return fmt.Errorf("failed to create gutenberg corpus: %w", err)
|
||||
}
|
||||
|
||||
stmt, err := tx.Prepare("INSERT INTO phrases (sourceid, phrase) VALUES (?, ?) ON CONFLICT DO NOTHING")
|
||||
defer stmt.Close()
|
||||
entries, err := dir.Readdirnames(-1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not read %s: %w", cutupDir, err)
|
||||
}
|
||||
|
||||
idx, err := os.Open(path.Join(cutupDir, "_title_index.tsv"))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open source index: %w", err)
|
||||
}
|
||||
|
||||
tx, err := conn.Begin(context.Background())
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open transaction: %w", err)
|
||||
}
|
||||
|
||||
s := bufio.NewScanner(idx)
|
||||
for s.Scan() {
|
||||
phrase := s.Text()
|
||||
if err != nil {
|
||||
return err
|
||||
line := s.Text()
|
||||
parts := strings.SplitN(line, " ", 2)
|
||||
if len(parts) != 2 {
|
||||
return fmt.Errorf("malformed line in sourceMap: %s", line)
|
||||
}
|
||||
_, err = tx.Exec(context.Background(),
|
||||
"INSERT INTO sources (id, corpusid, name) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
|
||||
parts[0], corpusid, parts[1])
|
||||
}
|
||||
|
||||
if _, err = stmt.Exec(sourceID, phrase); err != nil {
|
||||
return fmt.Errorf("could not insert phrase '%s' for source '%d': %w", phrase, sourceID, err)
|
||||
tx.Commit(context.Background())
|
||||
|
||||
for _, e := range entries {
|
||||
if strings.HasPrefix(e, "_") {
|
||||
continue
|
||||
}
|
||||
p := path.Join(cutupDir, e)
|
||||
sql := fmt.Sprintf("COPY phrases(sourceid, phrase) FROM '%s'", p)
|
||||
_, err = conn.Exec(context.Background(), sql)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "failed to ingest '%s': %s\n", p, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
return tx.Commit()
|
||||
return nil
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user