package main

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// maxResponseSize is the limit, in bytes, passed to MaxBytesReader when
// reading the body of a fetched page.
const maxResponseSize = 1e6 // 1MB

// Microformats used by Mastodon pages, summarized from
// https://docs.joinmastodon.org/spec/microformats/
//
// Root elements (h-*)
//
// h-feed
//     Represents a stream of entries. Attached to a profile's toots. Also
//     attached to the parent thread within detailed status views.
//
// h-entry
//     Represents episodic or date stamped online content. Attached to a status.
//
// URL properties (u-*)
//
// u-photo
//     Within h-card, represents the profile picture. Attached to the avatar
//     image.
//
// u-uid
//     Within h-entry or h-cite, represents a universally unique identifier.
//     Attached to timestamp link.
//
// u-url
//     Within h-entry or h-cite, represents the status permalink. Attached to
//     timestamp link. Within h-card, represents the profile permalink.
//     Attached to display name link.
//
// Datetime properties (dt-*)
//
// dt-published
//     Within h-entry or h-cite, represents the date and time at which the
//     status was published. Attached to data element with value attribute.
//
// Element tree (e-*)
//
// e-content
//     Within h-entry or h-cite, represents the content of the status.
//     Attached to status content.
//
// I learned after writing this that Mastodon also has RSS feeds,
// e.g.
https://tiny.tilde.website/@magical.rss type MastoSource struct { Items []*MastoItem Title string URL string LastFetch time.Time Error error LastStatusCode int LastStatus string mu sync.Mutex } var _ Source = &MastoSource{} type MastoFeed struct { Title string Items []*MastoItem } type MastoItem struct { Title string `json:"title,omitempty"` Content string `json:"content,omitempty"` Link string `json:"link,omitempty"` PublishedString string `json:"published,omitempty"` Author string `json:"author,omitempty"` IsBoost bool `json:"is_boost,omitempty"` } func NewMastoSource(url string) *MastoSource { return &MastoSource{ URL: url, } } func (src *MastoSource) update(ctx context.Context) { src.mu.Lock() defer src.mu.Unlock() req, err := http.NewRequestWithContext(ctx, "GET", src.URL, nil) if err != nil { src.Error = fmt.Errorf("error fetching %q: %w", src.URL, err) log.Println(src.Error) return // return err? } req.Header.Set("User-Agent", UserAgent) // TODO: If-Modified-Since, Etag resp, err := http.DefaultClient.Do(req) if err != nil { err := fmt.Errorf("error fetching %q: %w", src.URL, err) log.Println(err) src.Error = err return // return err? } if resp != nil && resp.Body != nil { defer func() { err := resp.Body.Close() if err != nil { log.Printf("error closing response body for %q: %v", src.URL, err) } }() } src.LastStatusCode = resp.StatusCode src.LastStatus = resp.Status if resp.StatusCode != 200 { src.Error = fmt.Errorf("error fetching %q: status %s", src.URL, resp.Status) log.Println(src.Error) return } // TODO: cache body body := MaxBytesReader(resp.Body, maxResponseSize) feed, err := parseMicroformats(body) if err != nil { err := fmt.Errorf("error parsing %q: %w", src.URL, err) log.Println(err) src.Error = err return // return err? 
} items := feed.Items sort.Slice(items, func(i, j int) bool { return items[i].PublishedString >= items[j].PublishedString }) src.Title = feed.Title src.Items = items src.LastFetch = time.Now() src.Error = nil } func parseMicroformats(r io.Reader) (*MastoFeed, error) { doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil, err } feed := new(MastoFeed) if doc.Find(".h-feed").Length() == 0 { return nil, fmt.Errorf("no feed content found") } feed.Title = doc.Find(".h-feed > .p-name").First().AttrOr("value", "") doc.Find(".h-feed").Find(".h-entry, .h-cite").Each(func(i int, elem *goquery.Selection) { cw := strings.TrimSpace(text(elem.Find(".p-summary").First())) // TODO: move this logic to GetItems if cw != "" { cw = "[" + cw + "] " } feed.Items = append(feed.Items, &MastoItem{ Title: "", Content: cw + text(elem.Find(".e-content").First()), Link: elem.Find("a.u-url.u-uid").AttrOr("href", ""), Author: text(elem.Find(".p-author .p-name").First()), PublishedString: elem.Find("data.dt-published").AttrOr("value", ""), IsBoost: elem.HasClass("h-cite"), }) }) return feed, nil } // Text gets the combined text contents of each element in the set of matched // elements, including their descendants. 
func text(s *goquery.Selection) string { var buf bytes.Buffer // Slightly optimized vs calling Each: no single selection object created var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.TextNode { // Keep newlines and spaces, like jQuery buf.WriteString(n.Data) } else if n.Type == html.ElementNode && n.DataAtom == atom.Br { //buf.WriteString("\n") buf.WriteString(" ") } else if n.Type == html.ElementNode && n.DataAtom == atom.P && n.PrevSibling != nil { //buf.WriteString("\n\n") buf.WriteString(" ") } else if n.Type == html.ElementNode && (n.DataAtom == atom.Script || n.DataAtom == atom.Style || n.DataAtom == atom.Template) { // nothing } if n.FirstChild != nil { for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } } for _, n := range s.Nodes { f(n) } return buf.String() } func (src *MastoSource) GetTitle() string { return src.Title } func (src *MastoSource) GetError() error { return src.Error } func (src *MastoSource) GetItems() (items []Item) { for _, x := range src.Items { text := x.Content if x.IsBoost { text = "RT @" + x.Author + ": " + text } d, _ := time.Parse(time.RFC3339, x.PublishedString) items = append(items, Item{ Date: d, Link: x.Link, Text: text, }) } return }