232 lines
5.8 KiB
Go
232 lines
5.8 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/atom"
|
|
)
|
|
|
|
// maxResponseSize is the limit handed to MaxBytesReader when reading a fetched
// page in update (1e6, i.e. 1 MB if the limit is counted in bytes — TODO
// confirm against MaxBytesReader). It is an untyped constant so it converts
// to whatever integer type the reader expects.
const maxResponseSize = 1e6
|
|
|
|
// https://docs.joinmastodon.org/spec/microformats/
|
|
|
|
// Root elements (h-*)
|
|
//
|
|
// h-feed
|
|
// Represents a stream of entries. Attached to a profile's toots. Also
|
|
// attached to the parent thread within detailed status views.
|
|
//
|
|
// h-entry
|
|
// Represents episodic or date stamped online content. Attached to a status.
|
|
//
|
|
// URL properties (u-*)
|
|
//
|
|
// u-photo
|
|
// Within h-card, represents the profile picture. Attached to the avatar image.
|
|
//
|
|
// u-uid
|
|
// Within h-entry or h-cite, represents a universally unique identifier.
|
|
// Attached to timestamp link.
|
|
//
|
|
// u-url
|
|
// Within h-entry or h-cite, represents the status permalink. Attached to
|
|
// timestamp link. Within h-card, represents the profile permalink.
|
|
// Attached to display name link.
|
|
//
|
|
// Datetime properties (dt-*)
|
|
//
|
|
// dt-published
|
|
// Within h-entry or h-cite, represents the date and time at which the
|
|
// status was published. Attached to data element with value attribute.
|
|
//
|
|
// Element tree (e-*)
|
|
//
|
|
// e-content
|
|
// Within h-entry or h-cite, represents the content of the status. Attached to status content.
|
|
//
|
|
|
|
type MastoSource struct {
|
|
Items []*MastoItem
|
|
Title string
|
|
URL string
|
|
LastFetch time.Time
|
|
Error error
|
|
|
|
LastStatusCode int
|
|
LastStatus string
|
|
|
|
mu sync.Mutex
|
|
}
|
|
|
|
// Compile-time check that *MastoSource implements Source.
var _ Source = (*MastoSource)(nil)
|
|
|
|
type MastoFeed struct {
|
|
Title string
|
|
Items []*MastoItem
|
|
}
|
|
|
|
// MastoItem is a single status (or boosted status) scraped from a feed page.
// Empty fields are omitted when the item is encoded as JSON.
type MastoItem struct {
	// Title is currently always left empty by the parser.
	Title string `json:"title,omitempty"`
	// Content is the plain-text status body, prefixed with "[summary] "
	// when the status carries a summary (content warning).
	Content string `json:"content,omitempty"`
	// Link is the status permalink.
	Link string `json:"link,omitempty"`
	// PublishedString is the raw publication timestamp exactly as it
	// appears in the page; GetItems parses it as RFC 3339.
	PublishedString string `json:"published,omitempty"`
	// Author is the display name of the status author.
	Author string `json:"author,omitempty"`
	// IsBoost reports whether the entry was marked up as a cite (h-cite)
	// rather than as an entry (h-entry).
	IsBoost bool `json:"is_boost,omitempty"`
}
|
|
|
|
func NewMastoSource(url string) *MastoSource {
|
|
return &MastoSource{
|
|
URL: url,
|
|
}
|
|
}
|
|
|
|
func (src *MastoSource) update(ctx context.Context) {
|
|
src.mu.Lock()
|
|
defer src.mu.Unlock()
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", src.URL, nil)
|
|
if err != nil {
|
|
src.Error = fmt.Errorf("error fetching %q: %w", src.URL, err)
|
|
log.Println(src.Error)
|
|
return // return err?
|
|
}
|
|
req.Header.Set("User-Agent", UserAgent)
|
|
// TODO: If-Modified-Since, Etag
|
|
|
|
resp, err := http.DefaultClient.Do(req)
|
|
if err != nil {
|
|
err := fmt.Errorf("error fetching %q: %w", src.URL, err)
|
|
log.Println(err)
|
|
src.Error = err
|
|
return // return err?
|
|
}
|
|
|
|
if resp != nil && resp.Body != nil {
|
|
defer func() {
|
|
err := resp.Body.Close()
|
|
if err != nil {
|
|
log.Printf("error closing response body for %q: %v", src.URL, err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
src.LastStatusCode = resp.StatusCode
|
|
src.LastStatus = resp.Status
|
|
if resp.StatusCode != 200 {
|
|
src.Error = fmt.Errorf("error fetching %q: status %s", src.URL, resp.Status)
|
|
log.Println(src.Error)
|
|
return
|
|
}
|
|
|
|
// TODO: cache body
|
|
body := MaxBytesReader(resp.Body, maxResponseSize)
|
|
|
|
feed, err := parseMicroformats(body)
|
|
if err != nil {
|
|
err := fmt.Errorf("error parsing %q: %w", src.URL, err)
|
|
log.Println(err)
|
|
src.Error = err
|
|
return // return err?
|
|
}
|
|
|
|
items := feed.Items
|
|
sort.Slice(items, func(i, j int) bool {
|
|
return items[i].PublishedString >= items[j].PublishedString
|
|
})
|
|
src.Title = feed.Title
|
|
src.Items = items
|
|
src.LastFetch = time.Now()
|
|
src.Error = nil
|
|
}
|
|
|
|
func parseMicroformats(r io.Reader) (*MastoFeed, error) {
|
|
doc, err := goquery.NewDocumentFromReader(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
feed := new(MastoFeed)
|
|
if doc.Find(".h-feed").Length() == 0 {
|
|
return nil, fmt.Errorf("no feed content found")
|
|
}
|
|
feed.Title = doc.Find(".h-feed > .p-name").First().AttrOr("value", "")
|
|
doc.Find(".h-feed").Find(".h-entry, .h-cite").Each(func(i int, elem *goquery.Selection) {
|
|
cw := strings.TrimSpace(text(elem.Find(".p-summary").First()))
|
|
// TODO: move this logic to GetItems
|
|
if cw != "" {
|
|
cw = "[" + cw + "] "
|
|
}
|
|
feed.Items = append(feed.Items, &MastoItem{
|
|
Title: "",
|
|
Content: cw + text(elem.Find(".e-content").First()),
|
|
Link: elem.Find("a.u-url.u-uid").AttrOr("href", ""),
|
|
Author: text(elem.Find(".p-author .p-name").First()),
|
|
PublishedString: elem.Find("data.dt-published").AttrOr("value", ""),
|
|
IsBoost: elem.HasClass("h-cite"),
|
|
})
|
|
})
|
|
return feed, nil
|
|
}
|
|
|
|
// Text gets the combined text contents of each element in the set of matched
|
|
// elements, including their descendants.
|
|
func text(s *goquery.Selection) string {
|
|
var buf bytes.Buffer
|
|
|
|
// Slightly optimized vs calling Each: no single selection object created
|
|
var f func(*html.Node)
|
|
f = func(n *html.Node) {
|
|
if n.Type == html.TextNode {
|
|
// Keep newlines and spaces, like jQuery
|
|
buf.WriteString(n.Data)
|
|
} else if n.Type == html.ElementNode && n.DataAtom == atom.Br {
|
|
//buf.WriteString("\n")
|
|
buf.WriteString(" ")
|
|
} else if n.Type == html.ElementNode && n.DataAtom == atom.P && n.PrevSibling != nil {
|
|
//buf.WriteString("\n\n")
|
|
buf.WriteString(" ")
|
|
} else if n.Type == html.ElementNode && (n.DataAtom == atom.Script || n.DataAtom == atom.Style || n.DataAtom == atom.Template) {
|
|
// nothing
|
|
}
|
|
if n.FirstChild != nil {
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
f(c)
|
|
}
|
|
}
|
|
}
|
|
for _, n := range s.Nodes {
|
|
f(n)
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// GetTitle returns the feed title stored by the last successful update.
// NOTE(review): reads src.Title without taking src.mu — confirm callers
// do not race with update.
func (src *MastoSource) GetTitle() string {
	return src.Title
}
|
|
// GetError returns the error recorded by the most recent update, or nil if
// that update succeeded (or none has run yet).
// NOTE(review): reads src.Error without taking src.mu — confirm callers
// do not race with update.
func (src *MastoSource) GetError() error {
	return src.Error
}
|
|
|
|
func (src *MastoSource) GetItems() (items []Item) {
|
|
for _, x := range src.Items {
|
|
text := x.Content
|
|
if x.IsBoost {
|
|
text = "RT @" + x.Author + ": " + text
|
|
}
|
|
d, _ := time.Parse(time.RFC3339, x.PublishedString)
|
|
items = append(items, Item{
|
|
Date: d,
|
|
Link: x.Link,
|
|
Text: text,
|
|
})
|
|
}
|
|
return
|
|
}
|