284 lines
4.8 KiB
Go
284 lines
4.8 KiB
Go
package main
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bufio"
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"io/fs"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"database/sql"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
const (
|
|
dsn = "/mnt/volume_tor1_01/gutenberg/chunker.db?cache=shared&mode=rwc"
|
|
target = "/mnt/volume_tor1_01/gutenberg/aleph.gutenberg.org"
|
|
)
|
|
|
|
func connectDB() (*sql.DB, error) {
|
|
db, err := sql.Open("sqlite3", dsn)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return db, nil
|
|
}
|
|
|
|
func createSchema(db *sql.DB) error {
|
|
s := `
|
|
CREATE TABLE IF NOT EXISTS files (
|
|
id INTEGER PRIMARY KEY,
|
|
name TEXT,
|
|
author TEXT,
|
|
filename TEXT,
|
|
content TEXT
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS chunks (
|
|
id INTEGER PRIMARY KEY,
|
|
chunk TEXT,
|
|
sourceid INTEGER,
|
|
|
|
FOREIGN KEY (sourceid) REFERENCES files(sourceid)
|
|
)`
|
|
|
|
_, err := db.Exec(s)
|
|
|
|
return err
|
|
}
|
|
|
|
func _main() error {
|
|
db, err := connectDB()
|
|
if err != nil {
|
|
return fmt.Errorf("could not connect to %s: %w", dsn, err)
|
|
}
|
|
|
|
if err = createSchema(db); err != nil {
|
|
return fmt.Errorf("failed to create db schema: %w", err)
|
|
}
|
|
|
|
//return readFiles(db)
|
|
|
|
return makeChunks(db)
|
|
|
|
//return nil
|
|
}
|
|
|
|
type bookfile struct {
|
|
ID int
|
|
Name string
|
|
Author string
|
|
Content string
|
|
Filename string
|
|
}
|
|
|
|
func extractChunks(db *sql.DB, id int) error {
|
|
var err error
|
|
var tx *sql.Tx
|
|
var stmt *sql.Stmt
|
|
|
|
tx, err = db.Begin()
|
|
end := func() {
|
|
if err != nil {
|
|
tx.Rollback()
|
|
} else {
|
|
tx.Commit()
|
|
}
|
|
}
|
|
defer end()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
stmt, err = tx.Prepare("SELECT name, author, filename, content FROM files WHERE id = ?")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
row := stmt.QueryRow(id)
|
|
var b bookfile
|
|
b.ID = id
|
|
err = row.Scan(&b.Name, &b.Author, &b.Filename, &b.Content)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
stmt.Close()
|
|
|
|
s := bufio.NewScanner(strings.NewReader(b.Content))
|
|
inHeader := true
|
|
inFooter := false
|
|
chunk := ""
|
|
for s.Scan() {
|
|
text := strings.TrimSpace(s.Text())
|
|
if inFooter {
|
|
break
|
|
}
|
|
if strings.HasPrefix(text, "*** START") {
|
|
inHeader = false
|
|
continue
|
|
}
|
|
if inHeader {
|
|
continue
|
|
}
|
|
if strings.HasPrefix(text, "*** END") {
|
|
inFooter = true
|
|
}
|
|
if text == "" {
|
|
// end of "paragraph"
|
|
if len(chunk) < 300 {
|
|
chunk = ""
|
|
continue
|
|
}
|
|
} else {
|
|
chunk += text + "\n"
|
|
continue
|
|
}
|
|
stmt, err = tx.Prepare("INSERT INTO chunks (sourceid, chunk) VALUES (?, ?)")
|
|
if err != nil {
|
|
return fmt.Errorf("could not prepare: %w", err)
|
|
}
|
|
|
|
_, err = stmt.Exec(b.ID, chunk)
|
|
if err != nil {
|
|
return fmt.Errorf("could not insert: %w", err)
|
|
}
|
|
stmt.Close()
|
|
chunk = ""
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func makeChunks(db *sql.DB) error {
|
|
ids := []int{}
|
|
|
|
rows, err := db.Query("SELECT id FROM files")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for rows.Next() {
|
|
var id int
|
|
err = rows.Scan(&id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ids = append(ids, id)
|
|
}
|
|
|
|
rows.Close()
|
|
|
|
max := len(ids)
|
|
|
|
for x, id := range ids {
|
|
fmt.Printf("%d of %d\r", x, max)
|
|
err = extractChunks(db, id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func readFiles(db *sql.DB) error {
|
|
return filepath.WalkDir(target, func(path string, d fs.DirEntry, err error) error {
|
|
if err != nil || d.IsDir() ||
|
|
!strings.HasSuffix(d.Name(), "zip") ||
|
|
strings.HasSuffix(d.Name(), "-8.zip") ||
|
|
strings.HasSuffix(d.Name(), "-0.zip") {
|
|
return err
|
|
}
|
|
|
|
r, err := zip.OpenReader(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer r.Close()
|
|
|
|
for x, f := range r.File {
|
|
if !strings.HasSuffix(f.Name, ".txt") {
|
|
fmt.Println("skipping ", f.Name)
|
|
continue
|
|
}
|
|
fmt.Println("doin ", f.Name)
|
|
if x > 0 {
|
|
break
|
|
}
|
|
c, err := f.Open()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
bs := bytes.NewBuffer([]byte{})
|
|
if _, err = io.Copy(bs, c); err != nil {
|
|
return err
|
|
}
|
|
name, author := extractNameAuthor(*bs)
|
|
if name == "" {
|
|
name = f.Name
|
|
}
|
|
|
|
stmt, err := db.Prepare("INSERT INTO files (name, author, content, filename) VALUES (?, ?, ?, ?)")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer stmt.Close()
|
|
_, err = stmt.Exec(name, author, bs.String(), f.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
func extractNameAuthor(content bytes.Buffer) (string, string) {
|
|
s := bufio.NewScanner(&content)
|
|
c := 0
|
|
|
|
var title string
|
|
var author string
|
|
|
|
for s.Scan() {
|
|
if author != "" && title != "" {
|
|
break
|
|
}
|
|
|
|
text := strings.TrimSpace(s.Text())
|
|
|
|
if strings.HasPrefix(text, "***") {
|
|
break
|
|
}
|
|
|
|
if strings.HasPrefix(text, "Title") {
|
|
sp := strings.SplitN(text, ":", 2)
|
|
if len(sp) == 2 {
|
|
title = strings.TrimSpace(sp[1])
|
|
}
|
|
}
|
|
|
|
if strings.HasPrefix(text, "Author") {
|
|
sp := strings.SplitN(text, ":", 2)
|
|
if len(sp) == 2 {
|
|
author = strings.TrimSpace(sp[1])
|
|
}
|
|
}
|
|
|
|
c++
|
|
}
|
|
|
|
return title, author
|
|
}
|
|
|
|
func main() {
|
|
if err := _main(); err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: "+err.Error())
|
|
}
|
|
}
|