some cmd cleanup, thoughts on data storage change
This commit is contained in:
parent
1803154a76
commit
6b79717c3e
@ -1,24 +1,65 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
|
"github.com/vilmibm/trunkless/db"
|
||||||
"github.com/vilmibm/trunkless/ingest"
|
"github.com/vilmibm/trunkless/ingest"
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
// TODO option for cutupDir
|
||||||
|
|
||||||
|
ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd")
|
||||||
|
ingestCmd.MarkFlagRequired("cutupdir")
|
||||||
rootCmd.AddCommand(ingestCmd)
|
rootCmd.AddCommand(ingestCmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
var ingestCmd = &cobra.Command{
|
var ingestCmd = &cobra.Command{
|
||||||
Use: "ingest corpusname",
|
Use: "ingest corpusname",
|
||||||
|
Short: "ingest already cut-up corpora from disk into database",
|
||||||
Args: cobra.ExactArgs(1),
|
Args: cobra.ExactArgs(1),
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
RunE: func(cmd *cobra.Command, args []string) error {
|
||||||
switch args[0] {
|
cutupDir := cmd.Flags().Lookup("cutupdir").Value.String()
|
||||||
case "gutenberg":
|
corpus := args[0]
|
||||||
return ingest.IngestGut()
|
|
||||||
|
conn, err := db.Connect()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
return fmt.Errorf("corpus unknown: %s", args[0])
|
|
||||||
|
opts := ingest.IngestOpts{
|
||||||
|
Conn: conn,
|
||||||
|
CutupDir: cutupDir,
|
||||||
|
Corpus: corpus,
|
||||||
|
}
|
||||||
|
|
||||||
|
return ingest.Ingest(opts)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// thoughts
|
||||||
|
//
|
||||||
|
// having multitenancy in the db makes phrase selection harder. i need to determine the ID offsets for each corpus's phrase list.
|
||||||
|
// ran an explain analyze (results below) for:
|
||||||
|
// explain analyze select min(p.id),max(p.id) from phrases p join sources s on s.id = p.sourceid and s.corpusid='cb20c3e';
|
||||||
|
// planning time 12ms
|
||||||
|
// exec time 91s
|
||||||
|
// tried again with an explicit inner join, which was faster but not noticeably so; the explain looks the same (which makes sense -- a bare JOIN is already an INNER JOIN in postgresql, and no nullable columns are involved).
|
||||||
|
|
||||||
|
// if i stick with this i can expect several minutes(!) of startup time for the server; however, since i'm generating ID lookups outside of sql, each lookup should still be O(1).
|
||||||
|
// some options:
|
||||||
|
// - change everything so every corpus is in its own table:
|
||||||
|
// ${corpus}_phrases: id, sourceid, text
|
||||||
|
// corpora: id, name
|
||||||
|
// sources: id, corpusid, name
|
||||||
|
// - cache the result of the min/max id analysis. i could do this to disk or in the db...i would probably do it in the db:
|
||||||
|
// id_ranges: corpusid, minid, maxid
|
||||||
|
|
||||||
|
// thinking about this more, as i add corpora the phrases table is going to
|
||||||
|
// grow into the billions (assuming other sources are similar in scale to
|
||||||
|
// gutenberg). turns out postgresql has table partitioning but idk if that will
|
||||||
|
// help me since the ID space will be shared.
|
||||||
|
|
||||||
|
// having a table per corpus's phrases will also make tearing down corpora easier -- otherwise i have to regen the entire phrases table to remove gaps in ID space.
|
||||||
|
|
||||||
|
// so it's settled; I'm going to retool for table-per-corpus.
|
||||||
|
5
db/db.go
5
db/db.go
@ -2,6 +2,7 @@ package db
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"crypto/sha1"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/jackc/pgx/v5"
|
"github.com/jackc/pgx/v5"
|
||||||
@ -13,6 +14,10 @@ const (
|
|||||||
MaxID = 345507789
|
MaxID = 345507789
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// StrToID derives a short, deterministic identifier from s: the first six
// lowercase hex characters (three bytes) of the SHA-1 digest of s.
//
// The input is hashed exactly as given, so leading or trailing whitespace
// changes the result. Six hex characters carry only 24 bits, so the value is
// a compact label rather than a collision-resistant key.
func StrToID(s string) string {
	sum := sha1.Sum([]byte(s))
	// Format only the bytes that are kept instead of rendering the whole
	// 40-character digest and slicing the string afterwards.
	return fmt.Sprintf("%x", sum[:3])
}
|
||||||
|
|
||||||
func Connect() (*pgx.Conn, error) {
|
func Connect() (*pgx.Conn, error) {
|
||||||
conn, err := pgx.Connect(context.Background(), "")
|
conn, err := pgx.Connect(context.Background(), "")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -8,6 +8,7 @@ import (
|
|||||||
"path"
|
"path"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/jackc/pgx/v5"
|
||||||
"github.com/vilmibm/trunkless/db"
|
"github.com/vilmibm/trunkless/db"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -16,7 +17,9 @@ const cutupDir = "/home/vilmibm/pg_plaintext/cutup"
|
|||||||
// TODO
|
// TODO
|
||||||
// - [X] finalize gutenberg ingestion
|
// - [X] finalize gutenberg ingestion
|
||||||
// - [ ] clean up commands
|
// - [ ] clean up commands
|
||||||
// - [ ] clean up repo
|
// - [X] get down to just ingest/cutup/serve
|
||||||
|
// - [ ] add arguments for generalizing
|
||||||
|
// - [X] clean up repo
|
||||||
// - [ ] push and deploy to town with new pg db
|
// - [ ] push and deploy to town with new pg db
|
||||||
// - [ ] gamefaqs extraction
|
// - [ ] gamefaqs extraction
|
||||||
// - [ ] corpus selector
|
// - [ ] corpus selector
|
||||||
@ -25,24 +28,20 @@ const cutupDir = "/home/vilmibm/pg_plaintext/cutup"
|
|||||||
// - [ ] blog post
|
// - [ ] blog post
|
||||||
// - [ ] launch
|
// - [ ] launch
|
||||||
|
|
||||||
func IngestGut() error {
|
type IngestOpts struct {
|
||||||
conn, err := db.Connect()
|
Conn *pgx.Conn
|
||||||
if err != nil {
|
Corpus string
|
||||||
return err
|
CutupDir string
|
||||||
}
|
}
|
||||||
defer conn.Close(context.Background())
|
|
||||||
|
|
||||||
dir, err := os.Open(cutupDir)
|
func Ingest(o IngestOpts) error {
|
||||||
|
conn := o.Conn
|
||||||
|
|
||||||
|
dir, err := os.Open(o.CutupDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("could not open %s: %w", cutupDir, err)
|
return fmt.Errorf("could not open %s: %w", cutupDir, err)
|
||||||
}
|
}
|
||||||
|
defer dir.Close()
|
||||||
// echo gutenberg | sha1sum | head -c7
|
|
||||||
corpusid := "cb20c3e"
|
|
||||||
_, err = conn.Exec(context.Background(), "INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING", corpusid, "gutenberg")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to create gutenberg corpus: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
entries, err := dir.Readdirnames(-1)
|
entries, err := dir.Readdirnames(-1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -53,7 +52,15 @@ func IngestGut() error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to open source index: %w", err)
|
return fmt.Errorf("failed to open source index: %w", err)
|
||||||
}
|
}
|
||||||
|
defer idx.Close()
|
||||||
|
|
||||||
|
corpusid := db.StrToID(o.Corpus)
|
||||||
|
_, err = conn.Exec(context.Background(),
|
||||||
|
"INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING",
|
||||||
|
corpusid, o.Corpus)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to create '%s' corpus: %w", o.Corpus, err)
|
||||||
|
}
|
||||||
tx, err := conn.Begin(context.Background())
|
tx, err := conn.Begin(context.Background())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("could not open transaction: %w", err)
|
return fmt.Errorf("could not open transaction: %w", err)
|
||||||
@ -84,6 +91,5 @@ func IngestGut() error {
|
|||||||
fmt.Fprintf(os.Stderr, "failed to ingest '%s': %s\n", p, err.Error())
|
fmt.Fprintf(os.Stderr, "failed to ingest '%s': %s\n", p, err.Error())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user