This commit is contained in:
nate smith 2024-07-06 00:22:06 -05:00
parent 6632a3e9de
commit adbdb8008d
3 changed files with 134 additions and 4 deletions

View File

@ -19,7 +19,7 @@ func init() {
rootCmd.AddCommand(cutupCmd)
}
var validFlavors = []string{"gutenberg"}
var validFlavors = []string{"gutenberg", "geocities"}
var cutupCmd = &cobra.Command{
Use: "cutup",

View File

@ -38,13 +38,15 @@ func extractGutenbergTitle(s string) string {
}
func Cutup(opts CutupOpts) error {
if opts.Flavor == "gutenberg" {
switch opts.Flavor {
case "gutenberg":
opts.headerEndCheck = gutenbergHeaderEndCheck
opts.footerBeginCheck = gutenbergFooterBeginCheck
} else {
default:
opts.headerEndCheck = defaultHeaderEndCheck
opts.footerBeginCheck = defaultFooterBeginCheck
}
err := os.Mkdir(opts.CutupDir, 0775)
if err != nil {
return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
@ -109,6 +111,11 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
var text string
var prefix string
// geocities flavor: state for stripping HTML tags from the scanned text
var inTag bool
var tagSkip bool
tagBuff := []byte{}
for s.Scan() {
text = strings.TrimSpace(s.Text())
if inHeader && opts.headerEndCheck(text) {
@ -142,6 +149,23 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
}
}
for i, r := range text {
if opts.Flavor == "geocities" {
if r == '<' {
inTag = true
continue
} else if r == '>' {
tagSkip = shouldSkipLine(string(tagBuff))
inTag = false
tagBuff = []byte{}
}
if inTag {
tagBuff = append(tagBuff, byte(r))
continue
}
if tagSkip {
continue
}
}
if v := shouldBreak(phraseBuff, r); v >= 0 {
if len(phraseBuff) > 0 {
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
@ -306,3 +330,23 @@ func clean(bs []byte) string {
return s
}
// ignoreTags lists the lowercase names of HTML elements whose opening tag
// makes shouldSkipLine report true.
var ignoreTags = []string{
	"head",
	"script",
	"style",
}

// shouldSkipLine reports whether tagBuff, the text found between '<' and '>'
// of a single tag, is the opening tag of one of the elements in ignoreTags.
//
// The comparison is case-insensitive and is made against the element name
// only, so attributes are ignored: `SCRIPT type="text/javascript"` matches,
// while `thead`, `img src="style.gif"` or `link rel="stylesheet"` do not.
// Closing tags (a leading '/') and empty input always yield false.
func shouldSkipLine(tagBuff string) bool {
	name := strings.ToLower(strings.TrimSpace(tagBuff))

	// A closing tag never starts a skipped region.
	if strings.HasPrefix(name, "/") {
		return false
	}

	// Keep only the element name: drop attributes and a self-closing slash.
	if i := strings.IndexAny(name, " \t\r\n/"); i >= 0 {
		name = name[:i]
	}

	for _, t := range ignoreTags {
		if name == t {
			return true
		}
	}
	return false
}

View File

@ -85,7 +85,7 @@ func Test_shouldBreak(t *testing.T) {
{
name: "phrase marker",
args: args{[]byte("whither good"), ';'},
expected: 1,
expected: 0,
},
// TODO test phrasemarkers
}
@ -190,3 +190,89 @@ func Test_clean(t *testing.T) {
})
}
}
// Test_shouldSkipLine checks that shouldSkipLine returns true for the opening
// head, script and style tags in either letter case, and false for their
// closing tags, for empty input and for unrelated text.
//
// The name starts with an upper-case "Test" so that `go test` discovers it;
// a function named test_... is compiled but never run.
func Test_shouldSkipLine(t *testing.T) {
	cases := []struct {
		name     string
		arg      string
		expected bool
	}{
		{
			name: "blank",
			arg:  "",
		},
		{
			name: "lol",
			arg:  "lol",
		},
		{
			name:     "head",
			arg:      "head",
			expected: true,
		},
		{
			name:     "HEAD",
			arg:      "HEAD",
			expected: true,
		},
		{
			name:     "/HEAD",
			arg:      "/HEAD",
			expected: false,
		},
		{
			name:     "/head",
			arg:      "/head",
			expected: false,
		},
		{
			name:     "style",
			arg:      "style",
			expected: true,
		},
		{
			name:     "STYLE",
			arg:      "STYLE",
			expected: true,
		},
		{
			name:     "/STYLE",
			arg:      "/STYLE",
			expected: false,
		},
		{
			name:     "/style",
			arg:      "/style",
			expected: false,
		},
		{
			name:     "script",
			arg:      "script",
			expected: true,
		},
		{
			name:     "SCRIPT",
			arg:      "SCRIPT",
			expected: true,
		},
		{
			name:     "/SCRIPT",
			arg:      "/SCRIPT",
			expected: false,
		},
		{
			name:     "/script",
			arg:      "/script",
			expected: false,
		},
	}

	for _, c := range cases {
		// Use the descriptive case name; c.arg is empty for the "blank" case.
		t.Run(c.name, func(t *testing.T) {
			result := shouldSkipLine(c.arg)
			if result != c.expected {
				t.Errorf("got '%v', expected '%v'", result, c.expected)
			}
		})
	}
}