nate smith 6632a3e9de wip
2024-07-01 22:20:39 -07:00

202 lines
3.2 KiB
Go

package main
import (
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"unicode/utf8"
)
const (
// TODO LOWERCASE
// TODO NUMBERS
// src is a format string for an archive root; main fills the single %s
// verb with a top-level archive directory name before walking it.
src = "/home/vilmibm/geocities/%s/geocities/YAHOOIDS"
// userDirPattern is the filepath.Match glob that main uses to recognise a
// user directory among the directories found while walking src.
userDirPattern = "/home/vilmibm/geocities/*/geocities/YAHOOIDS/?/?/*"
// t is the output directory; processUserDir creates one file per user
// directory inside it.
t = "/home/vilmibm/gc"
)
// main walks the UPPERCASE, LOWERCASE and NUMBERS archive roots (built from
// src), sends every directory matching userDirPattern to a channel, and runs
// processUserDir on each of them concurrently. It prints the number of user
// directories handled once all of them have finished. Any walk error panics.
func main() {
	// maxWorkers bounds how many user dirs are processed at the same time.
	// Every in-flight processUserDir holds open file handles, so an
	// unbounded fan-out can run the process out of file descriptors.
	const maxWorkers = 64

	userDirs := make(chan string)
	var wg sync.WaitGroup

	walkFn := func(s string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !d.IsDir() {
			return nil
		}
		isUserDir, err := filepath.Match(userDirPattern, s)
		if err != nil {
			return err
		}
		if !isUserDir {
			return nil
		}
		userDirs <- s
		// The wildcards in userDirPattern never match a path separator, so
		// nothing below a user dir can match, and processUserDir walks the
		// dir's contents itself. Descending here would only repeat work.
		return filepath.SkipDir
	}

	// Producer: the only sender on userDirs, so it is also the one closing it.
	go func() {
		dirs := []string{"UPPERCASE", "LOWERCASE", "NUMBERS"}
		for _, dir := range dirs {
			if err := filepath.WalkDir(fmt.Sprintf(src, dir), walkFn); err != nil {
				panic(err)
			}
		}
		close(userDirs)
	}()

	// sem is a counting semaphore: a slot is taken before a worker starts
	// and handed back when that worker returns.
	sem := make(chan struct{}, maxWorkers)
	totalUserDirs := 0
	for ud := range userDirs {
		totalUserDirs++
		wg.Add(1)
		sem <- struct{}{}
		// ud is passed as an argument so each worker gets its own copy
		// regardless of the Go version's loop-variable semantics.
		go func(dir string) {
			defer func() { <-sem }()
			processUserDir(&wg, dir)
		}(ud)
	}
	wg.Wait()
	fmt.Printf("processed %d user dirs\n", totalUserDirs)
}
// processUserDir walks the user directory ud and appends the contents of
// every file it considers text to a single output file named after ud's base
// name inside the directory t.
//
// Files matching ignoreSuffix are skipped. Files matching wantSuffix are
// copied unconditionally. Any other file is copied only if its first 1024
// bytes pass IsText; a failed read of that sample (including an empty file)
// is reported and the file is skipped. The output file is created lazily, so
// a user dir without text content produces no output file.
//
// wg.Done is called when the function returns. Walk, create, copy and close
// errors are fatal and panic.
func processUserDir(wg *sync.WaitGroup, ud string) {
	defer wg.Done()

	var outFile *os.File

	walkFn := func(s string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.IsDir() || ignoreSuffix(s) {
			return nil
		}

		fmt.Printf("READING: %s\n", s)
		f, err := os.Open(s)
		if err != nil {
			return err
		}
		defer f.Close()

		// sniffed holds the bytes already consumed from f while guessing
		// the content type; they are written out ahead of the remainder.
		var sniffed []byte
		if !wantSuffix(s) {
			// if it's not a text NOR an ignore suffix, try and guess if text
			// cribbed from godoc's source
			var buf [1024]byte
			n, err := f.Read(buf[:])
			if err != nil {
				fmt.Printf("\terror reading file: %s\n", err.Error())
				return nil
			}
			if !IsText(buf[:n]) {
				fmt.Printf("NOT TEXT: %s\n", s)
				return nil
			}
			sniffed = buf[:n]
		}

		if outFile == nil {
			outFile, err = os.Create(filepath.Join(t, filepath.Base(ud)))
			if err != nil {
				return fmt.Errorf("creating output for %q: %w", ud, err)
			}
		}

		var wrote int64
		if len(sniffed) > 0 {
			n, err := outFile.Write(sniffed)
			if err != nil {
				return fmt.Errorf("writing %q to %q: %w", s, outFile.Name(), err)
			}
			wrote = int64(n)
		}
		// Stream the rest straight from the handle that is already open
		// instead of re-reading the whole file into memory.
		n, err := outFile.ReadFrom(f)
		if err != nil {
			return fmt.Errorf("copying %q to %q: %w", s, outFile.Name(), err)
		}
		wrote += n
		fmt.Printf("WROTE: %s (%d bytes)\n", s, wrote)
		return nil
	}

	if err := filepath.WalkDir(ud, walkFn); err != nil {
		panic(err)
	}
	if outFile != nil {
		// A failed close can mean buffered data never reached disk.
		if err := outFile.Close(); err != nil {
			panic(err)
		}
	}
}
// IsText reports whether s looks like the beginning of a UTF-8 text file.
//
// Adapted from godoc's source. At most the first 1024 bytes are examined.
// The sample is rejected if it contains a UTF-8 decoding error (or U+FFFD)
// or a control character other than newline, carriage return, tab or form
// feed. Runes starting within the last utf8.UTFMax-1 bytes of the sample are
// not examined because a truncated sample may cut the final rune short; as a
// consequence a sample shorter than utf8.UTFMax bytes is always accepted.
//
// Unlike the godoc original, carriage returns are accepted so that files
// with CRLF line endings are recognised as text.
func IsText(s []byte) bool {
	const sampleLen = 1024 // at least utf8.UTFMax
	if len(s) > sampleLen {
		s = s[:sampleLen]
	}
	for i, c := range string(s) {
		if i+utf8.UTFMax > len(s) {
			// last char may be incomplete - ignore
			break
		}
		switch {
		case c == 0xFFFD:
			// decoding error - not a text file
			return false
		case c == '\n', c == '\r', c == '\t', c == '\f':
			// whitespace controls that legitimately occur in text
		case c < ' ':
			// any other control character - not a text file
			return false
		}
	}
	return true
}
var (
	// ignoreSuffices holds the file-name endings that ignoreSuffix treats
	// as content to skip; each one appears in lower and upper case.
	ignoreSuffices = []string{
		"jpg", "jpeg", "gif", "js", "css", "mp3", "wav", "midi",
		"JPG", "JPEG", "GIF", "JS", "CSS", "MP3", "WAV", "MIDI",
	}

	// wantSuffices holds the file-name endings that wantSuffix treats as
	// wanted text; each one appears in lower and upper case.
	wantSuffices = []string{
		"html", "htm", "txt",
		"HTML", "HTM", "TXT",
	}
)
// ignoreSuffix reports whether the file at path p has an extension listed in
// ignoreSuffices and should therefore be skipped.
//
// Only the real extension (the text after the final dot of the last path
// element) is compared, so a name that merely ends in the same letters, such
// as "nodejs", is not ignored. The comparison is case-insensitive, so
// mixed-case extensions such as ".Jpg" are caught as well.
func ignoreSuffix(p string) bool {
	ext := strings.TrimPrefix(filepath.Ext(p), ".")
	if ext == "" {
		return false
	}
	for _, s := range ignoreSuffices {
		if strings.EqualFold(ext, s) {
			return true
		}
	}
	return false
}
// wantSuffix reports whether the file at path p has an extension listed in
// wantSuffices, i.e. it is taken as text without inspecting its contents.
//
// Only the real extension (the text after the final dot of the last path
// element) is compared, so an extension-less name such as "rhythm" is not
// mistaken for an .htm file. The comparison is case-insensitive, so
// mixed-case extensions such as ".Html" are accepted as well.
func wantSuffix(p string) bool {
	ext := strings.TrimPrefix(filepath.Ext(p), ".")
	if ext == "" {
		return false
	}
	for _, s := range wantSuffices {
		if strings.EqualFold(ext, s) {
			return true
		}
	}
	return false
}