diff --git a/cutup/cutup.go b/cutup/cutup.go index 21937e1..e1720d7 100644 --- a/cutup/cutup.go +++ b/cutup/cutup.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path" + "path/filepath" "strings" "github.com/vilmibm/trunkless/db" @@ -116,7 +117,8 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { title = extractGutenbergTitle(text) continue } else { - title = path.Base(p) + base := path.Base(p) + title = strings.TrimSuffix(base, filepath.Ext(base)) } } if inHeader { @@ -168,8 +170,16 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { // there are a bunch of empty books in gutenberg :( these are text files // that just have start and end markers with nothing in between. nothing // i can do about it. + + // in gfaqs I got a few no content files; they have all of their content + // on one line with a bunch of special characters. it's a pathological + // case and i'm shocked more doesn't break but somehow in this printf + // sourceid renders as '' and it doesn't end up in the title index. fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p) } + // I would leave empty stuff out of sources but we need the number of + // things written to sources to match the number of initial entries. this + // means some sources in the DB will never be used but that's fine. sources <- fmt.Sprintf("%s\t%s", sourceid, title) } } diff --git a/ingest/ingest.go b/ingest/ingest.go index 5eaa0fc..2af7c98 100644 --- a/ingest/ingest.go +++ b/ingest/ingest.go @@ -12,8 +12,6 @@ import ( "github.com/vilmibm/trunkless/db" ) -const cutupDir = "/home/vilmibm/pg_plaintext/cutup" - // TODO // - [X] finalize gutenberg ingestion // - [ ] clean up commands @@ -36,6 +34,7 @@ type IngestOpts struct { func Ingest(o IngestOpts) error { conn := o.Conn + cutupDir := o.CutupDir dir, err := os.Open(o.CutupDir) if err != nil {