From adbdb8008d7260e8c8c5ee617969e6893229bd21 Mon Sep 17 00:00:00 2001
From: nate smith
Date: Sat, 6 Jul 2024 00:22:06 -0500
Subject: [PATCH] meow

---
Review notes (free-form section, ignored when the patch is applied):
- test_shouldSkipLine is renamed Test_shouldSkipLine. go test only runs
  functions named TestXxx, so the new table was never executed.
- shouldSkipLine matches the tag name by prefix instead of by substring.
  With substring matching, script src="/scripts/x.js" hit the "/script"
  check and was not skipped, while link rel="stylesheet" and thead were.
- tagBuff stores whole runes instead of truncating each rune to one byte.
- Subtests are named after the case's name field, not its argument.
- Hunk line counts are unchanged. The index lines still carry the blob ids
  of the original commit.
- Indentation was rebuilt from a copy of this patch that had lost its line
  breaks; confirm whitespace against the tree before applying.
- NOTE(review): in worker, '>' is not consumed after a tag closes and is
  passed on to shouldBreak. Confirm that this is intended.
- NOTE(review): the "phrase marker" expectation changes from 1 to 0 without
  a matching change to shouldBreak in this patch. Confirm that it is correct.

 cmd/cutup.go        |  2 +-
 cutup/cutup.go      | 48 +++++++++++++++++++++++--
 cutup/cutup_test.go | 88 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 4 deletions(-)

diff --git a/cmd/cutup.go b/cmd/cutup.go
index ed5514a..fb8f0c3 100644
--- a/cmd/cutup.go
+++ b/cmd/cutup.go
@@ -19,7 +19,7 @@ func init() {
 	rootCmd.AddCommand(cutupCmd)
 }
 
-var validFlavors = []string{"gutenberg"}
+var validFlavors = []string{"gutenberg", "geocities"}
 
 var cutupCmd = &cobra.Command{
 	Use: "cutup",
diff --git a/cutup/cutup.go b/cutup/cutup.go
index 8ff0dca..f4c3b97 100644
--- a/cutup/cutup.go
+++ b/cutup/cutup.go
@@ -38,13 +38,15 @@ func extractGutenbergTitle(s string) string {
 }
 
 func Cutup(opts CutupOpts) error {
-	if opts.Flavor == "gutenberg" {
+	switch opts.Flavor {
+	case "gutenberg":
 		opts.headerEndCheck = gutenbergHeaderEndCheck
 		opts.footerBeginCheck = gutenbergFooterBeginCheck
-	} else {
+	default:
 		opts.headerEndCheck = defaultHeaderEndCheck
 		opts.footerBeginCheck = defaultFooterBeginCheck
 	}
+
 	err := os.Mkdir(opts.CutupDir, 0775)
 	if err != nil {
 		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
@@ -109,6 +111,11 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
 		var text string
 		var prefix string
 
+		// geocities
+		var inTag bool
+		var tagSkip bool
+		tagBuff := []byte{}
+
 		for s.Scan() {
 			text = strings.TrimSpace(s.Text())
 			if inHeader && opts.headerEndCheck(text) {
@@ -142,6 +149,23 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
 				}
 			}
 			for i, r := range text {
+				if opts.Flavor == "geocities" {
+					if r == '<' {
+						inTag = true
+						continue
+					} else if r == '>' {
+						tagSkip = shouldSkipLine(string(tagBuff))
+						inTag = false
+						tagBuff = []byte{}
+					}
+					if inTag {
+						tagBuff = append(tagBuff, string(r)...)
+						continue
+					}
+					if tagSkip {
+						continue
+					}
+				}
 				if v := shouldBreak(phraseBuff, r); v >= 0 {
 					if len(phraseBuff) > 0 {
 						phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
@@ -306,3 +330,23 @@ func clean(bs []byte) string {
 
 	return s
 }
+
+var ignoreTags = []string{
+	"head",
+	"script",
+	"style",
+}
+
+// shouldSkipLine reports whether tagBuff, the text found between '<' and
+// '>', opens one of ignoreTags. The match is a case-insensitive prefix
+// test, so closing tags ("/head") and tags that only mention a name in an
+// attribute (href="/scripts/x.js", rel="stylesheet") return false.
+func shouldSkipLine(tagBuff string) bool {
+	s := strings.ToLower(strings.TrimSpace(tagBuff))
+	for _, t := range ignoreTags {
+		if strings.HasPrefix(s, t) {
+			return true
+		}
+	}
+	return false
+}
diff --git a/cutup/cutup_test.go b/cutup/cutup_test.go
index 231fc7a..8f435d4 100644
--- a/cutup/cutup_test.go
+++ b/cutup/cutup_test.go
@@ -85,7 +85,7 @@ func Test_shouldBreak(t *testing.T) {
 		{
 			name:     "phrase marker",
 			args:     args{[]byte("whither good"), ';'},
-			expected: 1,
+			expected: 0,
 		},
 		// TODO test phrasemarkers
 	}
@@ -190,3 +190,89 @@ func Test_clean(t *testing.T) {
 		})
 	}
 }
+
+func Test_shouldSkipLine(t *testing.T) {
+	cases := []struct {
+		name     string
+		arg      string
+		expected bool
+	}{
+		{
+			name: "blank",
+			arg:  "",
+		},
+		{
+			name: "lol",
+			arg:  "lol",
+		},
+		{
+			name:     "head",
+			arg:      "head",
+			expected: true,
+		},
+		{
+			name:     "HEAD",
+			arg:      "HEAD",
+			expected: true,
+		},
+		{
+			name:     "/HEAD",
+			arg:      "/HEAD",
+			expected: false,
+		},
+		{
+			name:     "/head",
+			arg:      "/head",
+			expected: false,
+		},
+		{
+			name:     "style",
+			arg:      "style",
+			expected: true,
+		},
+		{
+			name:     "STYLE",
+			arg:      "STYLE",
+			expected: true,
+		},
+		{
+			name:     "/STYLE",
+			arg:      "/STYLE",
+			expected: false,
+		},
+		{
+			name:     "/style",
+			arg:      "/style",
+			expected: false,
+		},
+		{
+			name:     "script",
+			arg:      "script",
+			expected: true,
+		},
+		{
+			name:     "SCRIPT",
+			arg:      "SCRIPT",
+			expected: true,
+		},
+		{
+			name:     "/SCRIPT",
+			arg:      "/SCRIPT",
+			expected: false,
+		},
+		{
+			name:     "/script",
+			arg:      "/script",
+			expected: false,
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			result := shouldSkipLine(c.arg)
+			if result != c.expected {
+				t.Errorf("got '%v', expected '%v'", result, c.expected)
+			}
+		})
+	}
+}