feedget/mastodon.go

235 lines
5.9 KiB
Go

package main
import (
"bytes"
"context"
"fmt"
"io"
"log"
"net/http"
"sort"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// maxResponseSize is the byte limit that update passes to MaxBytesReader
// when reading a fetched response body.
const maxResponseSize = 1e6 // 1MB
// https://docs.joinmastodon.org/spec/microformats/
// Root elements (h-*)
//
// h-feed
// Represents a stream of entries. Attached to a profile's toots. Also
// attached to the parent thread within detailed status views.
//
// h-entry
// Represents episodic or date stamped online content. Attached to a status.
//
// URL properties (u-*)
//
// u-photo
// Within h-card, represents the profile picture. Attached to the avatar image.
//
// u-uid
// Within h-entry or h-cite, represents a universally unique identifier.
// Attached to timestamp link.
//
// u-url
// Within h-entry or h-cite, represents the status permalink. Attached to
// timestamp link. Within h-card, represents the profile permalink.
// Attached to display name link.
//
// Datetime properties (dt-*)
//
// dt-published
// Within h-entry or h-cite, represents the date and time at which the
// status was published. Attached to data element with value attribute.
//
// Element tree (e-*)
//
// e-content
// Within h-entry or h-cite, represents the content of the status. Attached to status content.
//
// I learned after writing this that Mastodon also has RSS feeds,
// e.g. https://tiny.tilde.website/@magical.rss
// MastoSource is a feed source read from the microformats markup of a web
// page. update fetches URL and fills in the remaining fields.
type MastoSource struct {
	// Items holds the entries from the last successful update, as ordered
	// by update (descending PublishedString).
	Items []*MastoItem
	// Title is the feed title found by the last successful update.
	Title string
	// URL is the address update requests.
	URL string
	// LastFetch is the time the last successful update finished; update
	// does not touch it on failure.
	LastFetch time.Time
	// Error is the failure recorded by the most recent update, or nil if
	// that update succeeded.
	Error error
	// LastStatusCode and LastStatus are the HTTP status code and status
	// line of the most recent response update received.
	LastStatusCode int
	LastStatus     string
	// mu is held by update for the whole duration of a fetch.
	mu sync.Mutex
}
// Compile-time check that *MastoSource implements Source.
var _ Source = (*MastoSource)(nil)
// MastoFeed is the result of parseMicroformats for one HTML document.
type MastoFeed struct {
	// Title is the "value" attribute of the first ".h-feed > .p-name"
	// element, or "" if there is none.
	Title string
	// Items has one entry per ".h-entry" or ".h-cite" element found inside
	// ".h-feed".
	Items []*MastoItem
}
// MastoItem is a single status extracted from an h-entry or h-cite element.
type MastoItem struct {
	// Title is always left empty by parseMicroformats.
	Title string `json:"title,omitempty"`
	// Content is the flattened text of the ".e-content" element, prefixed
	// with "[summary] " when the status has a non-empty ".p-summary".
	Content string `json:"content,omitempty"`
	// Link is the href of the "a.u-url.u-uid" element.
	Link string `json:"link,omitempty"`
	// PublishedString is the raw "value" attribute of "data.dt-published";
	// GetItems parses it as RFC 3339.
	PublishedString string `json:"published,omitempty"`
	// Author is the text of the ".p-author .p-name" element.
	Author string `json:"author,omitempty"`
	// IsBoost is true when the status element carries the "h-cite" class.
	IsBoost bool `json:"is_boost,omitempty"`
}
// NewMastoSource returns a MastoSource for the page at url. Nothing is
// fetched here; every field other than URL starts at its zero value.
func NewMastoSource(url string) *MastoSource {
	src := new(MastoSource)
	src.URL = url
	return src
}
// update fetches src.URL, parses the microformats in the response and
// replaces the source's title and items with the result, ordered by
// descending PublishedString.
//
// src.mu is held for the whole call. Failures are not returned: each one is
// logged and stored in src.Error, and the previously fetched Title, Items
// and LastFetch are left untouched. LastStatusCode and LastStatus are
// updated whenever a response is received, including non-200 responses.
// On success src.Error is reset to nil and LastFetch is set to the current
// time.
func (src *MastoSource) update(ctx context.Context) {
	src.mu.Lock()
	defer src.mu.Unlock()

	// fail records err on the source and logs it.
	// TODO: consider returning the error to the caller as well.
	fail := func(err error) {
		src.Error = err
		log.Println(err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, src.URL, nil)
	if err != nil {
		fail(fmt.Errorf("error fetching %q: %w", src.URL, err))
		return
	}
	req.Header.Set("User-Agent", UserAgent)
	// TODO: If-Modified-Since, Etag

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fail(fmt.Errorf("error fetching %q: %w", src.URL, err))
		return
	}
	// When Do returns a nil error the response and its Body are always
	// non-nil, so the body can be closed unconditionally.
	defer func() {
		if err := resp.Body.Close(); err != nil {
			log.Printf("error closing response body for %q: %v", src.URL, err)
		}
	}()

	src.LastStatusCode = resp.StatusCode
	src.LastStatus = resp.Status
	if resp.StatusCode != http.StatusOK {
		fail(fmt.Errorf("error fetching %q: status %s", src.URL, resp.Status))
		return
	}

	// TODO: cache body
	body := MaxBytesReader(resp.Body, maxResponseSize)
	feed, err := parseMicroformats(body)
	if err != nil {
		fail(fmt.Errorf("error parsing %q: %w", src.URL, err))
		return
	}

	// Order by descending timestamp. The comparison must be strict (">",
	// not ">="): sort requires that less(i, i) is false and that equal
	// elements are not each "less" than the other. The stable variant keeps
	// page order for entries that share a timestamp.
	// NOTE(review): timestamps are compared as strings, which is only
	// chronological if they all use the same format and UTC offset.
	items := feed.Items
	sort.SliceStable(items, func(i, j int) bool {
		return items[i].PublishedString > items[j].PublishedString
	})

	src.Title = feed.Title
	src.Items = items
	src.LastFetch = time.Now()
	src.Error = nil
}
// parseMicroformats reads an HTML document from r and extracts the feed
// marked up with microformats classes.
//
// The feed title is the "value" attribute of the first ".h-feed > .p-name"
// element. Every ".h-entry" or ".h-cite" element below ".h-feed" becomes one
// MastoItem; ".h-cite" elements are flagged as boosts. It returns an error
// if the document cannot be parsed or contains no ".h-feed" element.
func parseMicroformats(r io.Reader) (*MastoFeed, error) {
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		return nil, err
	}

	root := doc.Find(".h-feed")
	if root.Length() == 0 {
		return nil, fmt.Errorf("no feed content found")
	}

	feed := &MastoFeed{
		Title: doc.Find(".h-feed > .p-name").First().AttrOr("value", ""),
	}

	root.Find(".h-entry, .h-cite").Each(func(_ int, entry *goquery.Selection) {
		content := text(entry.Find(".e-content").First())

		// A non-empty summary (content warning) is shown in brackets in
		// front of the content.
		// TODO: move this logic to GetItems
		warning := strings.TrimSpace(text(entry.Find(".p-summary").First()))
		if warning != "" {
			content = "[" + warning + "] " + content
		}

		item := new(MastoItem)
		item.Content = content
		item.Link = entry.Find("a.u-url.u-uid").AttrOr("href", "")
		item.Author = text(entry.Find(".p-author .p-name").First())
		item.PublishedString = entry.Find("data.dt-published").AttrOr("value", "")
		item.IsBoost = entry.HasClass("h-cite")
		feed.Items = append(feed.Items, item)
	})

	return feed, nil
}
// text returns the combined text content of every node in s, including
// descendants, flattened onto one line: each <br>, and each <p> that has a
// preceding sibling node, contributes a single space in place of a line
// break. The contents of <script>, <style> and <template> elements are
// skipped entirely.
func text(s *goquery.Selection) string {
	var buf bytes.Buffer

	// Walk the raw nodes instead of calling Each, which would create a
	// Selection object per node.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		switch n.Type {
		case html.TextNode:
			// Keep newlines and spaces, like jQuery
			buf.WriteString(n.Data)
		case html.ElementNode:
			switch n.DataAtom {
			case atom.Script, atom.Style, atom.Template:
				// Their children are code or inert markup, not readable
				// text, so do not descend into them.
				return
			case atom.Br:
				buf.WriteString(" ")
			case atom.P:
				if n.PrevSibling != nil {
					buf.WriteString(" ")
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}

	for _, n := range s.Nodes {
		walk(n)
	}
	return buf.String()
}
// GetTitle returns the feed title currently stored on the source.
// NOTE(review): src.Title is read without taking src.mu — confirm callers
// do not race with update.
func (src *MastoSource) GetTitle() string {
	return src.Title
}
// GetError returns the error currently stored on the source, or nil.
// NOTE(review): src.Error is read without taking src.mu — confirm callers
// do not race with update.
func (src *MastoSource) GetError() error {
	return src.Error
}
// GetItems converts the stored MastoItems into Items, in the same order.
//
// A boost's text is prefixed with "RT @<author>: ". The date is parsed from
// PublishedString as RFC 3339; a timestamp that does not parse yields the
// zero time. The result is nil when the source has no items.
func (src *MastoSource) GetItems() (items []Item) {
	for i := range src.Items {
		entry := src.Items[i]

		body := entry.Content
		if entry.IsBoost {
			body = "RT @" + entry.Author + ": " + body
		}

		// The parse error is deliberately dropped; published is then the
		// zero time.Time.
		published, _ := time.Parse(time.RFC3339, entry.PublishedString)

		items = append(items, Item{Date: published, Link: entry.Link, Text: body})
	}
	return items
}