commit 03348a427da275465134b34322386694896eb905
Author: vilmibm
Date:   Fri Jul 14 04:47:20 2023 +0000

    initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..41ca591
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# gutchunk
+
+this is code I used to process about 12gb of plaintext books from Project Gutenberg.
+
+it assumes a mirror of gutenberg books made using their [robot access](https://gutenberg.org/policy/robot_access.html) endpoint.
+
+so far the output of this is being used on [blackout](https://blackout.tilde.town).
+
+if you're a townie and want access to the database i made using this lmk.
+
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..e4614d0
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module git.tilde.town/gutchunker
+
+go 1.18
+
+require github.com/mattn/go-sqlite3 v1.14.17
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..04784a2
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
+github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..f2a173f
--- /dev/null
+++ b/main.go
@@ -0,0 +1,348 @@
+// Command gutchunker loads plaintext Project Gutenberg books from a local
+// mirror into a SQLite database and splits each book into paragraph-sized
+// chunks.
+package main
+
+import (
+	"archive/zip"
+	"bufio"
+	"bytes"
+	"database/sql"
+	"fmt"
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+
+	_ "github.com/mattn/go-sqlite3"
+)
+
+const (
+	// dsn is the data source name handed to the sqlite3 driver.
+	//
+	// NOTE(review): go-sqlite3 appears to honor URI parameters such as cache
+	// and mode only when the DSN starts with "file:" -- confirm before
+	// relying on them.
+	dsn = "/mnt/volume_tor1_01/gutenberg/chunker.db?cache=shared&mode=rwc"
+
+	// target is the root of the Project Gutenberg mirror walked by readFiles.
+	target = "/mnt/volume_tor1_01/gutenberg/aleph.gutenberg.org"
+
+	// minChunkLen is the smallest paragraph, in bytes, stored as a chunk.
+	minChunkLen = 300
+
+	// maxLineLen caps the length of one line of book text. It is far above
+	// bufio.Scanner's default limit so a single long line does not abort a
+	// whole book.
+	maxLineLen = 16 << 20
+
+	// startMarker and endMarker prefix the lines that delimit the book text;
+	// everything before the first and after the second is ignored.
+	startMarker = "*** START"
+	endMarker   = "*** END"
+)
+
+// connectDB opens a handle to the SQLite database named by dsn.
+func connectDB() (*sql.DB, error) {
+	return sql.Open("sqlite3", dsn)
+}
+
+// createSchema creates the files and chunks tables if they do not exist yet.
+func createSchema(db *sql.DB) error {
+	// chunks.sourceid points at files.id; files has no sourceid column.
+	const schema = `
+	CREATE TABLE IF NOT EXISTS files (
+		id INTEGER PRIMARY KEY,
+		name TEXT,
+		author TEXT,
+		filename TEXT,
+		content TEXT
+	);
+
+	CREATE TABLE IF NOT EXISTS chunks (
+		id INTEGER PRIMARY KEY,
+		chunk TEXT,
+		sourceid INTEGER,
+
+		FOREIGN KEY (sourceid) REFERENCES files(id)
+	)`
+
+	_, err := db.Exec(schema)
+	return err
+}
+
+// _main connects to the database, makes sure the schema exists and runs the
+// processing step that is currently enabled.
+func _main() error {
+	db, err := connectDB()
+	if err != nil {
+		return fmt.Errorf("could not connect to %s: %w", dsn, err)
+	}
+	defer db.Close()
+
+	if err := createSchema(db); err != nil {
+		return fmt.Errorf("failed to create db schema: %w", err)
+	}
+
+	// The steps are toggled by hand: enable readFiles to load the books into
+	// the files table, then makeChunks to split them into chunks.
+	//return readFiles(db)
+
+	return makeChunks(db)
+}
+
+// bookfile mirrors one row of the files table.
+type bookfile struct {
+	ID       int
+	Name     string
+	Author   string
+	Content  string
+	Filename string
+}
+
+// extractChunks splits the book stored in files row id into paragraphs and
+// inserts every paragraph of at least minChunkLen bytes into the chunks table.
+//
+// Only the lines between the start and end markers are considered; a blank
+// line ends a paragraph. All inserts for one book share a transaction, which
+// is rolled back if any step fails and committed otherwise.
+func extractChunks(db *sql.DB, id int) (err error) {
+	tx, err := db.Begin()
+	if err != nil {
+		return err
+	}
+	// Registered only after Begin succeeded, so tx is never nil in here.
+	defer func() {
+		if err != nil {
+			tx.Rollback() // best effort; err already describes the failure
+			return
+		}
+		err = tx.Commit()
+	}()
+
+	b := bookfile{ID: id}
+	row := tx.QueryRow("SELECT name, author, filename, content FROM files WHERE id = ?", id)
+	if err = row.Scan(&b.Name, &b.Author, &b.Filename, &b.Content); err != nil {
+		return err
+	}
+
+	// Prepared once per book rather than once per chunk.
+	insert, err := tx.Prepare("INSERT INTO chunks (sourceid, chunk) VALUES (?, ?)")
+	if err != nil {
+		return fmt.Errorf("could not prepare: %w", err)
+	}
+	defer insert.Close()
+
+	var para strings.Builder
+	// flush stores the pending paragraph if it is long enough and resets it.
+	flush := func() error {
+		defer para.Reset()
+		if para.Len() < minChunkLen {
+			return nil
+		}
+		if _, err := insert.Exec(b.ID, para.String()); err != nil {
+			return fmt.Errorf("could not insert: %w", err)
+		}
+		return nil
+	}
+
+	sc := bufio.NewScanner(strings.NewReader(b.Content))
+	sc.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineLen)
+
+	inBody := false
+scan:
+	for sc.Scan() {
+		line := strings.TrimSpace(sc.Text())
+		switch {
+		case strings.HasPrefix(line, startMarker):
+			inBody = true
+		case !inBody:
+			// Still inside the header that precedes the book text.
+		case strings.HasPrefix(line, endMarker):
+			break scan
+		case line == "":
+			if err = flush(); err != nil {
+				return err
+			}
+		default:
+			para.WriteString(line)
+			para.WriteByte('\n')
+		}
+	}
+	if err = sc.Err(); err != nil {
+		return fmt.Errorf("could not scan file %d: %w", id, err)
+	}
+
+	// The last paragraph may run straight into the end marker or the end of
+	// the text without a blank line after it, so flush once more.
+	return flush()
+}
+
+// makeChunks runs extractChunks for every row of the files table and prints
+// its progress to stdout.
+func makeChunks(db *sql.DB) error {
+	ids, err := fileIDs(db)
+	if err != nil {
+		return err
+	}
+
+	for i, id := range ids {
+		fmt.Printf("%d of %d\r", i+1, len(ids))
+		if err := extractChunks(db, id); err != nil {
+			return fmt.Errorf("could not chunk file %d: %w", id, err)
+		}
+	}
+
+	return nil
+}
+
+// fileIDs returns the id of every row in the files table. The ids are read
+// completely before returning so the query is finished by the time the caller
+// starts writing chunks.
+func fileIDs(db *sql.DB) ([]int, error) {
+	rows, err := db.Query("SELECT id FROM files")
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var ids []int
+	for rows.Next() {
+		var id int
+		if err := rows.Scan(&id); err != nil {
+			return nil, err
+		}
+		ids = append(ids, id)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	return ids, nil
+}
+
+// readFiles walks the mirror under target and stores the text of every book
+// archive it finds in the files table.
+func readFiles(db *sql.DB) error {
+	insert, err := db.Prepare("INSERT INTO files (name, author, content, filename) VALUES (?, ?, ?, ?)")
+	if err != nil {
+		return err
+	}
+	defer insert.Close()
+
+	return filepath.WalkDir(target, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		if d.IsDir() || !isBookArchive(d.Name()) {
+			return nil
+		}
+		return importArchive(insert, path)
+	})
+}
+
+// isBookArchive reports whether name looks like a zip archive to import. The
+// "-8.zip" and "-0.zip" variants are skipped (presumably alternate encodings
+// of a book that also has a plain archive -- TODO confirm).
+func isBookArchive(name string) bool {
+	return strings.HasSuffix(name, "zip") &&
+		!strings.HasSuffix(name, "-8.zip") &&
+		!strings.HasSuffix(name, "-0.zip")
+}
+
+// importArchive stores the text file of the zip archive at path using the
+// prepared insert statement. Only the first entry of the archive is imported,
+// and only if its name ends in ".txt".
+func importArchive(insert *sql.Stmt, path string) error {
+	r, err := zip.OpenReader(path)
+	if err != nil {
+		return fmt.Errorf("could not open %s: %w", path, err)
+	}
+	defer r.Close()
+
+	for i, f := range r.File {
+		if !strings.HasSuffix(f.Name, ".txt") {
+			fmt.Println("skipping ", f.Name)
+			continue
+		}
+		// NOTE(review): a .txt entry that is not first in its archive is never
+		// imported -- confirm that "first .txt entry" was not the intent.
+		if i > 0 {
+			break
+		}
+		fmt.Println("doin ", f.Name)
+
+		content, err := readZipFile(f)
+		if err != nil {
+			return fmt.Errorf("could not read %s in %s: %w", f.Name, path, err)
+		}
+
+		// extractNameAuthor takes a bytes.Buffer by value, so wrap content for it.
+		name, author := extractNameAuthor(*bytes.NewBuffer(content))
+		if name == "" {
+			name = f.Name
+		}
+
+		if _, err := insert.Exec(name, author, string(content), f.Name); err != nil {
+			return fmt.Errorf("could not store %s: %w", f.Name, err)
+		}
+	}
+
+	return nil
+}
+
+// readZipFile returns the uncompressed contents of the archive entry f and
+// closes the entry before returning.
+func readZipFile(f *zip.File) ([]byte, error) {
+	rc, err := f.Open()
+	if err != nil {
+		return nil, err
+	}
+	defer rc.Close()
+
+	return io.ReadAll(rc)
+}
+
+// extractNameAuthor reads the header of a book and returns the values of its
+// "Title" and "Author" lines. Scanning stops once both are found or at the
+// first line starting with "***"; a value that was not found is returned as
+// the empty string.
+func extractNameAuthor(content bytes.Buffer) (string, string) {
+	var title, author string
+
+	s := bufio.NewScanner(&content)
+	for (title == "" || author == "") && s.Scan() {
+		text := strings.TrimSpace(s.Text())
+		if strings.HasPrefix(text, "***") {
+			break
+		}
+
+		// Everything after the first colon is the value; a line without a
+		// colon is ignored.
+		_, value, ok := strings.Cut(text, ":")
+		if !ok {
+			continue
+		}
+		value = strings.TrimSpace(value)
+
+		if strings.HasPrefix(text, "Title") {
+			title = value
+		}
+		if strings.HasPrefix(text, "Author") {
+			author = value
+		}
+	}
+
+	return title, author
+}
+
+// main runs _main, reporting any error on stderr with a non-zero exit status.
+func main() {
+	if err := _main(); err != nil {
+		// Println rather than Printf: the error text is data, not a format
+		// string, and may itself contain '%'.
+		fmt.Fprintln(os.Stderr, "error:", err)
+		os.Exit(1)
+	}
+}