initial commit

trunk
vilmibm 2023-07-14 04:47:20 +00:00
commit 03348a427d
4 changed files with 300 additions and 0 deletions

10
README.md 100644
View File

@ -0,0 +1,10 @@
# gutchunk
this is code I used to process about 12gb of plaintext books from Project Gutenberg.
it assumes a mirror of gutenberg books made using their [robot access](https://gutenberg.org/policy/robot_access.html) endpoint.
so far the output of this is being used on [blackout](https://blackout.tilde.town).
if you're a townie and want access to the database i made using this lmk.

5
go.mod 100644
View File

@ -0,0 +1,5 @@
module git.tilde.town/gutchunker
go 1.18
require github.com/mattn/go-sqlite3 v1.14.17

2
go.sum 100644
View File

@ -0,0 +1,2 @@
github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=

283
main.go 100644
View File

@ -0,0 +1,283 @@
package main
import (
"archive/zip"
"bufio"
"bytes"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"strings"
"database/sql"
_ "github.com/mattn/go-sqlite3"
)
const (
dsn = "/mnt/volume_tor1_01/gutenberg/chunker.db?cache=shared&mode=rwc"
target = "/mnt/volume_tor1_01/gutenberg/aleph.gutenberg.org"
)
func connectDB() (*sql.DB, error) {
db, err := sql.Open("sqlite3", dsn)
if err != nil {
return nil, err
}
return db, nil
}
func createSchema(db *sql.DB) error {
s := `
CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY,
name TEXT,
author TEXT,
filename TEXT,
content TEXT
);
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY,
chunk TEXT,
sourceid INTEGER,
FOREIGN KEY (sourceid) REFERENCES files(sourceid)
)`
_, err := db.Exec(s)
return err
}
func _main() error {
db, err := connectDB()
if err != nil {
return fmt.Errorf("could not connect to %s: %w", dsn, err)
}
if err = createSchema(db); err != nil {
return fmt.Errorf("failed to create db schema: %w", err)
}
//return readFiles(db)
return makeChunks(db)
//return nil
}
type bookfile struct {
ID int
Name string
Author string
Content string
Filename string
}
func extractChunks(db *sql.DB, id int) error {
var err error
var tx *sql.Tx
var stmt *sql.Stmt
tx, err = db.Begin()
end := func() {
if err != nil {
tx.Rollback()
} else {
tx.Commit()
}
}
defer end()
if err != nil {
return err
}
stmt, err = tx.Prepare("SELECT name, author, filename, content FROM files WHERE id = ?")
if err != nil {
return err
}
row := stmt.QueryRow(id)
var b bookfile
b.ID = id
err = row.Scan(&b.Name, &b.Author, &b.Filename, &b.Content)
if err != nil {
return err
}
stmt.Close()
s := bufio.NewScanner(strings.NewReader(b.Content))
inHeader := true
inFooter := false
chunk := ""
for s.Scan() {
text := strings.TrimSpace(s.Text())
if inFooter {
break
}
if strings.HasPrefix(text, "*** START") {
inHeader = false
continue
}
if inHeader {
continue
}
if strings.HasPrefix(text, "*** END") {
inFooter = true
}
if text == "" {
// end of "paragraph"
if len(chunk) < 300 {
chunk = ""
continue
}
} else {
chunk += text + "\n"
continue
}
stmt, err = tx.Prepare("INSERT INTO chunks (sourceid, chunk) VALUES (?, ?)")
if err != nil {
return fmt.Errorf("could not prepare: %w", err)
}
_, err = stmt.Exec(b.ID, chunk)
if err != nil {
return fmt.Errorf("could not insert: %w", err)
}
stmt.Close()
chunk = ""
}
return nil
}
func makeChunks(db *sql.DB) error {
ids := []int{}
rows, err := db.Query("SELECT id FROM files")
if err != nil {
return err
}
for rows.Next() {
var id int
err = rows.Scan(&id)
if err != nil {
return err
}
ids = append(ids, id)
}
rows.Close()
max := len(ids)
for x, id := range ids {
fmt.Printf("%d of %d\r", x, max)
err = extractChunks(db, id)
if err != nil {
return err
}
}
return nil
}
func readFiles(db *sql.DB) error {
return filepath.WalkDir(target, func(path string, d fs.DirEntry, err error) error {
if err != nil || d.IsDir() ||
!strings.HasSuffix(d.Name(), "zip") ||
strings.HasSuffix(d.Name(), "-8.zip") ||
strings.HasSuffix(d.Name(), "-0.zip") {
return err
}
r, err := zip.OpenReader(path)
if err != nil {
return err
}
defer r.Close()
for x, f := range r.File {
if !strings.HasSuffix(f.Name, ".txt") {
fmt.Println("skipping ", f.Name)
continue
}
fmt.Println("doin ", f.Name)
if x > 0 {
break
}
c, err := f.Open()
if err != nil {
return err
}
bs := bytes.NewBuffer([]byte{})
if _, err = io.Copy(bs, c); err != nil {
return err
}
name, author := extractNameAuthor(*bs)
if name == "" {
name = f.Name
}
stmt, err := db.Prepare("INSERT INTO files (name, author, content, filename) VALUES (?, ?, ?, ?)")
if err != nil {
return err
}
defer stmt.Close()
_, err = stmt.Exec(name, author, bs.String(), f.Name)
if err != nil {
return err
}
}
return nil
})
}
func extractNameAuthor(content bytes.Buffer) (string, string) {
s := bufio.NewScanner(&content)
c := 0
var title string
var author string
for s.Scan() {
if author != "" && title != "" {
break
}
text := strings.TrimSpace(s.Text())
if strings.HasPrefix(text, "***") {
break
}
if strings.HasPrefix(text, "Title") {
sp := strings.SplitN(text, ":", 2)
if len(sp) == 2 {
title = strings.TrimSpace(sp[1])
}
}
if strings.HasPrefix(text, "Author") {
sp := strings.SplitN(text, ":", 2)
if len(sp) == 2 {
author = strings.TrimSpace(sp[1])
}
}
c++
}
return title, author
}
func main() {
if err := _main(); err != nil {
fmt.Fprintf(os.Stderr, "error: "+err.Error())
}
}