gfaq stuff
This commit is contained in:
parent
e152494484
commit
7d6990a143
@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/vilmibm/trunkless/db"
|
"github.com/vilmibm/trunkless/db"
|
||||||
@ -116,7 +117,8 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
|||||||
title = extractGutenbergTitle(text)
|
title = extractGutenbergTitle(text)
|
||||||
continue
|
continue
|
||||||
} else {
|
} else {
|
||||||
title = path.Base(p)
|
base := path.Base(p)
|
||||||
|
title = strings.TrimSuffix(base, filepath.Ext(base))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if inHeader {
|
if inHeader {
|
||||||
@ -168,8 +170,16 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
|||||||
// there are a bunch of empty books in gutenberg :( these are text files
|
// there are a bunch of empty books in gutenberg :( these are text files
|
||||||
// that just have start and end markers with nothing in between. nothing
|
// that just have start and end markers with nothing in between. nothing
|
||||||
// i can do about it.
|
// i can do about it.
|
||||||
|
|
||||||
|
// in gfaqs I got a few no content files; they have all of their content
|
||||||
|
// on one line with a bunch of special characters. it's a pathological
|
||||||
|
// case and i'm shocked more doesn't break but somehow in this printf
|
||||||
|
// sourceid renders as '' and it doesn't end up in the title index.
|
||||||
fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
|
fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
|
||||||
}
|
}
|
||||||
|
// I would leave empty stuff out of sources but we need the number of
|
||||||
|
// things written to sources to match the number of initial entries. this
|
||||||
|
// means some sources in the DB will never be used, but that's fine.
|
||||||
sources <- fmt.Sprintf("%s\t%s", sourceid, title)
|
sources <- fmt.Sprintf("%s\t%s", sourceid, title)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12,8 +12,6 @@ import (
|
|||||||
"github.com/vilmibm/trunkless/db"
|
"github.com/vilmibm/trunkless/db"
|
||||||
)
|
)
|
||||||
|
|
||||||
const cutupDir = "/home/vilmibm/pg_plaintext/cutup"
|
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
// - [X] finalize gutenberg ingestion
|
// - [X] finalize gutenberg ingestion
|
||||||
// - [ ] clean up commands
|
// - [ ] clean up commands
|
||||||
@ -36,6 +34,7 @@ type IngestOpts struct {
|
|||||||
|
|
||||||
func Ingest(o IngestOpts) error {
|
func Ingest(o IngestOpts) error {
|
||||||
conn := o.Conn
|
conn := o.Conn
|
||||||
|
cutupDir := o.CutupDir
|
||||||
|
|
||||||
dir, err := os.Open(o.CutupDir)
|
dir, err := os.Open(o.CutupDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user