This commit is contained in:
nate smith 2024-07-06 00:22:06 -05:00
parent 6632a3e9de
commit adbdb8008d
3 changed files with 134 additions and 4 deletions

View File

@ -19,7 +19,7 @@ func init() {
rootCmd.AddCommand(cutupCmd) rootCmd.AddCommand(cutupCmd)
} }
var validFlavors = []string{"gutenberg"} var validFlavors = []string{"gutenberg", "geocities"}
var cutupCmd = &cobra.Command{ var cutupCmd = &cobra.Command{
Use: "cutup", Use: "cutup",

View File

@ -38,13 +38,15 @@ func extractGutenbergTitle(s string) string {
} }
func Cutup(opts CutupOpts) error { func Cutup(opts CutupOpts) error {
if opts.Flavor == "gutenberg" { switch opts.Flavor {
case "gutenberg":
opts.headerEndCheck = gutenbergHeaderEndCheck opts.headerEndCheck = gutenbergHeaderEndCheck
opts.footerBeginCheck = gutenbergFooterBeginCheck opts.footerBeginCheck = gutenbergFooterBeginCheck
} else { default:
opts.headerEndCheck = defaultHeaderEndCheck opts.headerEndCheck = defaultHeaderEndCheck
opts.footerBeginCheck = defaultFooterBeginCheck opts.footerBeginCheck = defaultFooterBeginCheck
} }
err := os.Mkdir(opts.CutupDir, 0775) err := os.Mkdir(opts.CutupDir, 0775)
if err != nil { if err != nil {
return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err) return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
@ -109,6 +111,11 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
var text string var text string
var prefix string var prefix string
// geocities
var inTag bool
var tagSkip bool
tagBuff := []byte{}
for s.Scan() { for s.Scan() {
text = strings.TrimSpace(s.Text()) text = strings.TrimSpace(s.Text())
if inHeader && opts.headerEndCheck(text) { if inHeader && opts.headerEndCheck(text) {
@ -142,6 +149,23 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
} }
} }
for i, r := range text { for i, r := range text {
if opts.Flavor == "geocities" {
if r == '<' {
inTag = true
continue
} else if r == '>' {
tagSkip = shouldSkipLine(string(tagBuff))
inTag = false
tagBuff = []byte{}
}
if inTag {
tagBuff = append(tagBuff, byte(r))
continue
}
if tagSkip {
continue
}
}
if v := shouldBreak(phraseBuff, r); v >= 0 { if v := shouldBreak(phraseBuff, r); v >= 0 {
if len(phraseBuff) > 0 { if len(phraseBuff) > 0 {
phraseBuff = phraseBuff[0 : len(phraseBuff)-v] phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
@ -306,3 +330,23 @@ func clean(bs []byte) string {
return s return s
} }
// ignoreTags lists the HTML element names whose following text should be
// skipped by shouldSkipLine.
var ignoreTags = []string{
	"head",
	"script",
	"style",
}

// shouldSkipLine reports whether the text that follows an HTML tag should be
// skipped. tagBuff is the tag's content without the surrounding angle
// brackets, e.g. `script type="text/javascript"` or `/head`.
//
// Only the element name is considered, and it is compared case-insensitively
// against ignoreTags. Closing tags (those starting with '/') never trigger a
// skip. Matching on the element name rather than on any substring keeps tags
// such as `thead`, `header`, `link rel="stylesheet"`, or any element carrying
// a `style="..."` attribute from being mistaken for an ignored element.
func shouldSkipLine(tagBuff string) bool {
	tag := strings.TrimSpace(tagBuff)
	if strings.HasPrefix(tag, "/") {
		// A closing tag ends whatever region was being skipped.
		return false
	}

	// The element name runs up to the first whitespace character or to the
	// '/' of a self-closing tag.
	name := tag
	if i := strings.IndexAny(tag, " \t\r\n\f/"); i >= 0 {
		name = tag[:i]
	}

	for _, t := range ignoreTags {
		if strings.EqualFold(name, t) {
			return true
		}
	}
	return false
}

View File

@ -85,7 +85,7 @@ func Test_shouldBreak(t *testing.T) {
{ {
name: "phrase marker", name: "phrase marker",
args: args{[]byte("whither good"), ';'}, args: args{[]byte("whither good"), ';'},
expected: 1, expected: 0,
}, },
// TODO test phrasemarkers // TODO test phrasemarkers
} }
@ -190,3 +190,89 @@ func Test_clean(t *testing.T) {
}) })
} }
} }
// Test_shouldSkipLine checks that shouldSkipLine returns true for opening
// head, script, and style tags in either letter case, and false for their
// closing counterparts, for unrelated text, and for empty input.
//
// The name must begin with "Test" so that `go test` discovers and runs it.
func Test_shouldSkipLine(t *testing.T) {
	cases := []struct {
		name     string
		arg      string
		expected bool
	}{
		{name: "blank", arg: ""},
		{name: "lol", arg: "lol"},
		{name: "head", arg: "head", expected: true},
		{name: "HEAD", arg: "HEAD", expected: true},
		{name: "/HEAD", arg: "/HEAD", expected: false},
		{name: "/head", arg: "/head", expected: false},
		{name: "style", arg: "style", expected: true},
		{name: "STYLE", arg: "STYLE", expected: true},
		{name: "/STYLE", arg: "/STYLE", expected: false},
		{name: "/style", arg: "/style", expected: false},
		{name: "script", arg: "script", expected: true},
		{name: "SCRIPT", arg: "SCRIPT", expected: true},
		{name: "/SCRIPT", arg: "/SCRIPT", expected: false},
		{name: "/script", arg: "/script", expected: false},
	}

	for _, c := range cases {
		// Use the case name for the subtest so the blank-input case is
		// still identifiable in test output.
		t.Run(c.name, func(t *testing.T) {
			if result := shouldSkipLine(c.arg); result != c.expected {
				t.Errorf("got '%v', expected '%v'", result, c.expected)
			}
		})
	}
}