gfaq stuff

This commit is contained in:
nate smith 2024-06-30 13:39:32 -07:00
parent e152494484
commit 7d6990a143
2 changed files with 12 additions and 3 deletions

View File

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"os" "os"
"path" "path"
"path/filepath"
"strings" "strings"
"github.com/vilmibm/trunkless/db" "github.com/vilmibm/trunkless/db"
@ -116,7 +117,8 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
title = extractGutenbergTitle(text) title = extractGutenbergTitle(text)
continue continue
} else { } else {
title = path.Base(p) base := path.Base(p)
title = strings.TrimSuffix(base, filepath.Ext(base))
} }
} }
if inHeader { if inHeader {
@ -168,8 +170,16 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
// there are a bunch of empty books in gutenberg :( these are text files // there are a bunch of empty books in gutenberg :( these are text files
// that just have start and end markers with nothing in between. nothing // that just have start and end markers with nothing in between. nothing
// i can do about it. // i can do about it.
// in gfaqs I got a few no content files; they have all of their content
// on one line with a bunch of special characters. it's a pathological
// case and i'm shocked more doesn't break but somehow in this printf
// sourceid renders as '' and it doesn't end up in the title index.
fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p) fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
} }
// I would leave empty stuff out of sources but we need the number of
// things written to sources to match the number of initial entries. this
// means some sources in the DB that will never be used but that's fine.
sources <- fmt.Sprintf("%s\t%s", sourceid, title) sources <- fmt.Sprintf("%s\t%s", sourceid, title)
} }
} }

View File

@ -12,8 +12,6 @@ import (
"github.com/vilmibm/trunkless/db" "github.com/vilmibm/trunkless/db"
) )
const cutupDir = "/home/vilmibm/pg_plaintext/cutup"
// TODO // TODO
// - [X] finalize gutenberg ingestion // - [X] finalize gutenberg ingestion
// - [ ] clean up commands // - [ ] clean up commands
@ -36,6 +34,7 @@ type IngestOpts struct {
func Ingest(o IngestOpts) error { func Ingest(o IngestOpts) error {
conn := o.Conn conn := o.Conn
cutupDir := o.CutupDir
dir, err := os.Open(o.CutupDir) dir, err := os.Open(o.CutupDir)
if err != nil { if err != nil {