package main

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// maxResponseSize is the byte limit handed to MaxBytesReader when
// (*MastoSource).update reads a fetched response body.
const maxResponseSize = 1e6 // 1MB

// Summary of the Mastodon microformats classes relevant to parsing, taken from
// https://docs.joinmastodon.org/spec/microformats/
// Root elements (h-*)
//
// h-feed
// Represents a stream of entries. Attached to a profile's toots. Also
// attached to the parent thread within detailed status views.
//
// h-entry
// Represents episodic or date stamped online content. Attached to a status.
//
// URL properties (u-*)
//
// u-photo
// Within h-card, represents the profile picture. Attached to the avatar image.
//
// u-uid
// Within h-entry or h-cite, represents a universally unique identifier.
// Attached to timestamp link.
//
// u-url
// Within h-entry or h-cite, represents the status permalink. Attached to
// timestamp link. Within h-card, represents the profile permalink.
// Attached to display name link.
//
//
//
// Datetime properties (dt-*)
//
// dt-published
// Within h-entry or h-cite, represents the date and time at which the
// status was published. Attached to data element with value attribute.
//
// Element tree (e-*)
//
// e-content
// Within h-entry or h-cite, represents the content of the status. Attached to status content.
// type MastoSource struct { Items []*MastoItem Title string URL string LastFetch time.Time Error error LastStatusCode int LastStatus string mu sync.Mutex } type MastoFeed struct { Title string Items []*MastoItem } type MastoItem struct { Title string `json:"title,omitempty"` Content string `json:"content,omitempty"` Link string `json:"link,omitempty"` PublishedString string `json:"published,omitempty"` Author string `json:"author,omitempty"` } func NewMastoSource(url string) *MastoSource { return &MastoSource{ URL: url, } } func (src *MastoSource) update(ctx context.Context) { src.mu.Lock() defer src.mu.Unlock() req, err := http.NewRequestWithContext(ctx, "GET", src.URL, nil) if err != nil { src.Error = fmt.Errorf("error fetching %q: %w", src.URL, err) log.Println(src.Error) return // return err? } req.Header.Set("User-Agent", UserAgent) // TODO: If-Modified-Since, Etag resp, err := http.DefaultClient.Do(req) if err != nil { err := fmt.Errorf("error fetching %q: %w", src.URL, err) log.Println(err) src.Error = err return // return err? } if resp != nil && resp.Body != nil { defer func() { err := resp.Body.Close() if err != nil { log.Printf("error closing response body for %q: %v", src.URL, err) } }() } src.LastStatusCode = resp.StatusCode src.LastStatus = resp.Status if resp.StatusCode != 200 { src.Error = fmt.Errorf("error fetching %q: status %s", src.URL, resp.Status) log.Println(src.Error) return } // TODO: cache body body := MaxBytesReader(resp.Body, maxResponseSize) feed, err := parseMicroformats(body) if err != nil { err := fmt.Errorf("error parsing %q: %w", src.URL, err) log.Println(err) src.Error = err return // return err? 
} items := feed.Items sort.Slice(items, func(i, j int) bool { return items[i].PublishedString >= items[j].PublishedString }) src.Title = feed.Title src.Items = items src.LastFetch = time.Now() src.Error = nil } func parseMicroformats(r io.Reader) (*MastoFeed, error) { doc, err := goquery.NewDocumentFromReader(r) if err != nil { return nil, err } feed := new(MastoFeed) if doc.Find(".h-feed").Length() == 0 { return nil, fmt.Errorf("no feed content found") } feed.Title = doc.Find(".h-feed > .p-name").First().AttrOr("value", "") doc.Find(".h-feed .h-entry").Each(func(i int, elem *goquery.Selection) { cw := strings.TrimSpace(text(elem.Find(".p-summary"))) if cw != "" { cw = "[" + cw + "] " } feed.Items = append(feed.Items, &MastoItem{ Title: "", Content: cw + text(elem.Find(".e-content")), Link: elem.Find("a.u-url.u-uid").AttrOr("href", ""), Author: text(elem.Find(".p-author .p-name")), PublishedString: elem.Find("data.dt-published").AttrOr("value", ""), }) }) return feed, nil } // Text gets the combined text contents of each element in the set of matched // elements, including their descendants. func text(s *goquery.Selection) string { var buf bytes.Buffer // Slightly optimized vs calling Each: no single selection object created var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.TextNode { // Keep newlines and spaces, like jQuery buf.WriteString(n.Data) } else if n.Type == html.ElementNode && n.DataAtom == atom.Br { //buf.WriteString("\n") buf.WriteString(" ") } else if n.Type == html.ElementNode && n.DataAtom == atom.P && n.PrevSibling != nil { //buf.WriteString("\n\n") buf.WriteString(" ") } if n.FirstChild != nil { for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } } for _, n := range s.Nodes { f(n) } return buf.String() } // MaxBytesReader is similar to io.LimitReader but is intended for // limiting the size of incoming request bodies. 
// In contrast to io.LimitReader, MaxBytesReader's result is a ReadCloser,
// returns a non-EOF error for a Read beyond the limit, and closes the
// underlying reader when its Close method is called.
//
// MaxBytesReader prevents clients from accidentally or maliciously
// sending a large request and wasting server resources.
//
// A negative n is treated as 0. Any n up to math.MaxInt64 is supported.
//
// Based on http.MaxBytesReader
func MaxBytesReader(r io.ReadCloser, n int64) io.ReadCloser {
	if n < 0 { // Treat negative limits as equivalent to 0.
		n = 0
	}
	return &maxBytesReader{r: r, n: n}
}

// maxBytesReader is the io.ReadCloser returned by MaxBytesReader.
type maxBytesReader struct {
	r   io.ReadCloser // underlying reader
	n   int64         // max bytes remaining
	err error         // sticky error
}

// Read reads from the underlying reader, returning at most the remaining
// number of allowed bytes.
//
// Once the underlying reader yields more data than the limit allows, Read
// returns the bytes that still fit together with a non-EOF error. Whatever
// error Read returns (including io.EOF from the underlying reader) is sticky:
// every later call returns 0 and that same error.
func (l *maxBytesReader) Read(p []byte) (n int, err error) {
	if l.err != nil {
		return 0, l.err
	}
	if len(p) == 0 {
		return 0, nil
	}
	// If they asked for a 32KB byte read but only 5 bytes are
	// remaining, no need to read 32KB. 6 bytes will answer the
	// question of the whether we hit the limit or go past it.
	//
	// The comparison is written as len(p)-1 > l.n rather than
	// len(p) > l.n+1 so that it cannot overflow when l.n is math.MaxInt64;
	// inside the branch l.n < len(p)-1, so l.n+1 is safe.
	if int64(len(p))-1 > l.n {
		p = p[:l.n+1]
	}
	n, err = l.r.Read(p)

	if int64(n) <= l.n {
		l.n -= int64(n)
		l.err = err
		return n, err
	}

	// The underlying reader produced at least one byte past the limit:
	// hand back only what fits and fail from now on.
	n = int(l.n)
	l.n = 0
	l.err = errors.New("http: response body too large")
	return n, l.err
}

// Close closes the underlying reader.
func (l *maxBytesReader) Close() error {
	return l.r.Close()
}