284 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			284 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package main
 | |
| 
 | |
| import (
 | |
| 	"archive/zip"
 | |
| 	"bufio"
 | |
| 	"bytes"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"io/fs"
 | |
| 	"os"
 | |
| 	"path/filepath"
 | |
| 	"strings"
 | |
| 
 | |
| 	"database/sql"
 | |
| 
 | |
| 	_ "github.com/mattn/go-sqlite3"
 | |
| )
 | |
| 
 | |
// dsn is the data source name handed to the "sqlite3" driver when the
// database is opened.
const dsn = "/mnt/volume_tor1_01/gutenberg/chunker.db?cache=shared&mode=rwc"

// target is the root directory that readFiles walks looking for zip archives.
const target = "/mnt/volume_tor1_01/gutenberg/aleph.gutenberg.org"
 | |
| 
 | |
| func connectDB() (*sql.DB, error) {
 | |
| 	db, err := sql.Open("sqlite3", dsn)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	return db, nil
 | |
| }
 | |
| 
 | |
// createSchema creates the files and chunks tables when they do not exist
// yet.
//
// Both CREATE statements are sent to the driver in a single Exec call and the
// error from that call is returned unchanged. Because the statements use
// IF NOT EXISTS, tables that are already present keep the definition they
// were created with.
func createSchema(db *sql.DB) error {
	// chunks.sourceid stores the id of the files row a chunk was cut from,
	// so the foreign key has to point at files(id); files has no sourceid
	// column to reference.
	const ddl = `
		CREATE TABLE IF NOT EXISTS files (
			id       INTEGER PRIMARY KEY,
			name     TEXT,
			author   TEXT,
			filename TEXT,
			content  TEXT
		);

		CREATE TABLE IF NOT EXISTS chunks (
			id       INTEGER PRIMARY KEY,
			chunk    TEXT,
			sourceid INTEGER,

			FOREIGN KEY (sourceid) REFERENCES files(id)
		)`

	_, err := db.Exec(ddl)

	return err
}
 | |
| 
 | |
| func _main() error {
 | |
| 	db, err := connectDB()
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("could not connect to %s: %w", dsn, err)
 | |
| 	}
 | |
| 
 | |
| 	if err = createSchema(db); err != nil {
 | |
| 		return fmt.Errorf("failed to create db schema: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	//return readFiles(db)
 | |
| 
 | |
| 	return makeChunks(db)
 | |
| 
 | |
| 	//return nil
 | |
| }
 | |
| 
 | |
// bookfile holds the columns of one row of the files table.
type bookfile struct {
	// ID is the row's primary key.
	ID int
	// Name, Author, Content and Filename carry the text columns of the row.
	Name, Author, Content, Filename string
}
 | |
| 
 | |
// minChunkLen is the smallest paragraph, in bytes, that is stored as a chunk.
const minChunkLen = 300

// extractChunks loads the content of the files row with the given id, cuts
// the book body into paragraph chunks (see splitChunks) and inserts every
// chunk into the chunks table with sourceid set to id.
//
// All database work happens inside one transaction. The transaction is
// committed when every step succeeded and rolled back otherwise; a failure to
// begin or to commit the transaction is returned to the caller.
func extractChunks(db *sql.DB, id int) (err error) {
	tx, err := db.Begin()
	if err != nil {
		// There is no transaction to finish, so return before the deferred
		// commit/rollback is registered.
		return err
	}
	defer func() {
		if err != nil {
			// Report the failure that caused the rollback, not the
			// rollback's own result.
			tx.Rollback()
			return
		}
		err = tx.Commit()
	}()

	var content string
	err = tx.QueryRow("SELECT content FROM files WHERE id = ?", id).Scan(&content)
	if err != nil {
		return err
	}

	chunks, err := splitChunks(content)
	if err != nil {
		return err
	}
	if len(chunks) == 0 {
		return nil
	}

	// Prepare the insert once and reuse it for every chunk of this book.
	stmt, err := tx.Prepare("INSERT INTO chunks (sourceid, chunk) VALUES (?, ?)")
	if err != nil {
		return fmt.Errorf("could not prepare: %w", err)
	}
	defer stmt.Close()

	for _, chunk := range chunks {
		if _, err = stmt.Exec(id, chunk); err != nil {
			return fmt.Errorf("could not insert: %w", err)
		}
	}
	return nil
}

// splitChunks returns the paragraphs of content that lie between the first
// line starting with "*** START" and the first following line starting with
// "*** END".
//
// Lines are trimmed of surrounding white space and joined with "\n"; a blank
// line, the end marker or the end of the text closes a paragraph. Paragraphs
// shorter than minChunkLen bytes are discarded. When no start marker is
// present the result is empty.
func splitChunks(content string) ([]string, error) {
	sc := bufio.NewScanner(strings.NewReader(content))
	// No line can be longer than the whole text, so this limit keeps the
	// scanner from stopping early on an unusually long line.
	sc.Buffer(nil, len(content)+1)

	var (
		chunks []string
		para   strings.Builder
		inBody bool
	)
	// flush closes the current paragraph, keeping it only if it is long enough.
	flush := func() {
		if para.Len() >= minChunkLen {
			chunks = append(chunks, para.String())
		}
		para.Reset()
	}

scan:
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		switch {
		case strings.HasPrefix(line, "*** START"):
			// The marker line itself is never part of a chunk.
			inBody = true
		case !inBody:
			// Still in the header that precedes the start marker.
		case strings.HasPrefix(line, "*** END"):
			break scan
		case line == "":
			flush()
		default:
			para.WriteString(line)
			para.WriteByte('\n')
		}
	}
	if err := sc.Err(); err != nil {
		return nil, err
	}

	// A paragraph that runs straight into the end marker or the end of the
	// text has not been closed by a blank line yet.
	flush()
	return chunks, nil
}
 | |
| 
 | |
| func makeChunks(db *sql.DB) error {
 | |
| 	ids := []int{}
 | |
| 
 | |
| 	rows, err := db.Query("SELECT id FROM files")
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	for rows.Next() {
 | |
| 		var id int
 | |
| 		err = rows.Scan(&id)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		ids = append(ids, id)
 | |
| 	}
 | |
| 
 | |
| 	rows.Close()
 | |
| 
 | |
| 	max := len(ids)
 | |
| 
 | |
| 	for x, id := range ids {
 | |
| 		fmt.Printf("%d of %d\r", x, max)
 | |
| 		err = extractChunks(db, id)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func readFiles(db *sql.DB) error {
 | |
| 	return filepath.WalkDir(target, func(path string, d fs.DirEntry, err error) error {
 | |
| 		if err != nil || d.IsDir() ||
 | |
| 			!strings.HasSuffix(d.Name(), "zip") ||
 | |
| 			strings.HasSuffix(d.Name(), "-8.zip") ||
 | |
| 			strings.HasSuffix(d.Name(), "-0.zip") {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		r, err := zip.OpenReader(path)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		defer r.Close()
 | |
| 
 | |
| 		for x, f := range r.File {
 | |
| 			if !strings.HasSuffix(f.Name, ".txt") {
 | |
| 				fmt.Println("skipping ", f.Name)
 | |
| 				continue
 | |
| 			}
 | |
| 			fmt.Println("doin ", f.Name)
 | |
| 			if x > 0 {
 | |
| 				break
 | |
| 			}
 | |
| 			c, err := f.Open()
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			bs := bytes.NewBuffer([]byte{})
 | |
| 			if _, err = io.Copy(bs, c); err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			name, author := extractNameAuthor(*bs)
 | |
| 			if name == "" {
 | |
| 				name = f.Name
 | |
| 			}
 | |
| 
 | |
| 			stmt, err := db.Prepare("INSERT INTO files (name, author, content, filename) VALUES (?, ?, ?, ?)")
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			defer stmt.Close()
 | |
| 			_, err = stmt.Exec(name, author, bs.String(), f.Name)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		return nil
 | |
| 	})
 | |
| }
 | |
| 
 | |
// extractNameAuthor looks for the title and the author in the lines of
// content that come before the first line starting with "***".
//
// A line whose trimmed text starts with "Title" or "Author" and contains a
// colon supplies the corresponding value: the trimmed text after the first
// colon. Scanning stops as soon as both values are non-empty. Values that
// were not found are returned as empty strings. content is received by
// value, so the caller's buffer is not consumed.
func extractNameAuthor(content bytes.Buffer) (string, string) {
	var title, author string

	// field returns the trimmed text that follows the first colon of line.
	field := func(line string) (string, bool) {
		colon := strings.Index(line, ":")
		if colon < 0 {
			return "", false
		}
		return strings.TrimSpace(line[colon+1:]), true
	}

	lines := bufio.NewScanner(&content)
	for (title == "" || author == "") && lines.Scan() {
		line := strings.TrimSpace(lines.Text())
		if strings.HasPrefix(line, "***") {
			break
		}

		switch {
		case strings.HasPrefix(line, "Title"):
			if value, ok := field(line); ok {
				title = value
			}
		case strings.HasPrefix(line, "Author"):
			if value, ok := field(line); ok {
				author = value
			}
		}
	}

	return title, author
}
 | |
| 
 | |
| func main() {
 | |
| 	if err := _main(); err != nil {
 | |
| 		fmt.Fprintf(os.Stderr, "error: "+err.Error())
 | |
| 	}
 | |
| }
 |