diff --git a/cutup/cutup.go b/cutup/cutup.go index e1720d7..8ff0dca 100644 --- a/cutup/cutup.go +++ b/cutup/cutup.go @@ -133,6 +133,7 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { } if sourceid == "" { sourceid = db.StrToID(title) + fmt.Fprintln(os.Stderr, sourceid, p) prefix = sourceid + "\t" of, err = os.Create(path.Join(opts.CutupDir, sourceid)) if err != nil { diff --git a/geocities/main.go b/geocities/main.go index 221f348..072054a 100644 --- a/geocities/main.go +++ b/geocities/main.go @@ -4,10 +4,18 @@ import ( "fmt" "os" "path/filepath" + "strings" "sync" + "unicode/utf8" ) -const p = "/home/vilmibm/geocities/UPPERCASE/geocities/YAHOOIDS" +const ( + // TODO LOWERCASE + // TODO NUMBERS + src = "/home/vilmibm/geocities/%s/geocities/YAHOOIDS" + userDirPattern = "/home/vilmibm/geocities/*/geocities/YAHOOIDS/?/?/*" + t = "/home/vilmibm/gc" +) func main() { userDirs := make(chan string) @@ -17,11 +25,12 @@ func main() { if err != nil { return err } + if !d.IsDir() { return nil } - isUserDir, err := filepath.Match("/home/vilmibm/geocities/UPPERCASE/geocities/YAHOOIDS/?/?/*", s) + isUserDir, err := filepath.Match(userDirPattern, s) if err != nil { return err } @@ -30,29 +39,163 @@ func main() { userDirs <- s } - //if len(d.Name()) > 1 && d.IsDir() { - // fmt.Printf("%s %s\n", s, d.Name()) - //} - // TODO be able to tell when s is a full path (ie to a file for user) - // TODO sniff what kind of file full path points to - // TODO if text, read and append to file for that user return nil } + go func() { - err := filepath.WalkDir(p, walkFn) - close(userDirs) - if err != nil { - panic(err) + dirs := []string{"UPPERCASE", "LOWERCASE", "NUMBERS"} + for _, dir := range dirs { + err := filepath.WalkDir(fmt.Sprintf(src, dir), walkFn) + if err != nil { + panic(err) + } } + close(userDirs) }() + + totalUserDirs := 0 + for ud := range userDirs { + totalUserDirs++ wg.Add(1) go processUserDir(&wg, ud) } + wg.Wait() + + fmt.Printf("processed %d user dirs\n", totalUserDirs) } func processUserDir(wg *sync.WaitGroup, ud string) { defer wg.Done() - fmt.Printf("GONNA PROCESS %s\n", ud) + + var outFile *os.File + walkFn := func(s string, d os.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + if ignoreSuffix(s) { + return nil + } + fmt.Printf("READING: %s\n", s) + f, err := os.Open(s) + if err != nil { + return err + } + defer f.Close() + if !wantSuffix(s) { + // if it's not a text NOR an ignore suffix, try and guess if text + + // cribbed from godoc's source + var buf [1024]byte + n, err := f.Read(buf[0:]) + if err != nil { + fmt.Printf("\terror reading file: %s\n", err.Error()) + return nil + } + + if !IsText(buf[0:n]) { + fmt.Printf("NOT TEXT: %s\n", s) + return nil + } + } + + if outFile == nil { + outFile, err = os.Create(filepath.Join(t, filepath.Base(ud))) + if err != nil { + panic(err) + } + } + all, err := os.ReadFile(s) + if err != nil { + panic(err) + } + wrote, _ := outFile.Write(all) + fmt.Printf("WROTE: %s (%d bytes)\n", s, wrote) + + return nil + } + + err := filepath.WalkDir(ud, walkFn) + if err != nil { + panic(err) + } + if outFile != nil { + outFile.Close() + } +} + +// from godoc source +func IsText(s []byte) bool { + const max = 1024 // at least utf8.UTFMax + if len(s) > max { + s = s[0:max] + } + for i, c := range string(s) { + if i+utf8.UTFMax > len(s) { + // last char may be incomplete - ignore + break + } + if c == 0xFFFD || c < ' ' && c != '\n' && c != '\t' && c != '\f' { + // decoding error or control character - not a text file + return false + } + } + return true +} + +var ignoreSuffices []string +var wantSuffices []string + +func init() { + ignoreSuffices = []string{ + "jpg", + "jpeg", + "gif", + "js", + "css", + "mp3", + "wav", + "midi", + "JPG", + "JPEG", + "GIF", + "JS", + "CSS", + "MP3", + "WAV", + "MIDI", + } + wantSuffices = []string{ + "html", + "htm", + "txt", + "HTML", + "HTM", + "TXT", + } +} + +func ignoreSuffix(p string) bool { + for _, s := range ignoreSuffices { + if strings.HasSuffix(p, s) { + return true + } + } + + return false +} + +func wantSuffix(p string) bool { + for _, s := range wantSuffices { + if strings.HasSuffix(p, s) { + return true + } + } + + return false + }