feedget/main.go

161 lines
3.3 KiB
Go
Raw Normal View History

2021-09-30 00:23:50 +00:00
// Feedget scrapes RSS feeds (and other sources)
// and spits the latest headline from each onto a static web page.
package main
2021-11-15 03:32:24 +00:00
import (
"context"
"fmt"
"log"
"net/http"
2021-11-15 03:32:24 +00:00
"sort"
"sync"
"time"
"github.com/mmcdole/gofeed"
)
// UserAgent is the value sent in the User-Agent header of every feed request
// made by (*FeedSource).update.
const UserAgent = "feedget/0.1"
2021-11-15 03:32:24 +00:00
func main() {
2021-12-31 06:20:09 +00:00
var sources = []*FeedSource{ // TODO: interface Source
NewFeed("https://tilde.team/~dozens/dreams/rss.xml"),
NewFeed("https://tilde.town/~magical/xkcd.xml"), // "https://xkcd.com/atom.xml",
2021-12-31 22:28:06 +00:00
//NewFeed("https://tilde.town/~magical/404.xml"),
2021-11-15 03:32:24 +00:00
}
2021-12-31 09:03:45 +00:00
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
2021-11-15 03:32:24 +00:00
var wg sync.WaitGroup
wg.Add(len(sources))
for i := range sources {
src := sources[i]
go func() {
2021-12-31 09:03:45 +00:00
src.update(ctx)
2021-11-15 03:32:24 +00:00
wg.Done()
}()
}
wg.Wait()
for _, src := range sources {
fmt.Println(src.Title, src.Error, src.LastStatus)
2021-12-31 09:03:45 +00:00
for i, x := range src.Items {
if i > 5 {
break
}
fmt.Println("\t", x.PublishedParsed.Format(time.Stamp), x.Title)
}
2021-11-15 03:32:24 +00:00
}
2021-12-31 08:48:18 +00:00
src := NewMastoSource("https://tilde.town/~magical/masto_test.html")
2021-12-31 09:03:45 +00:00
src.update(ctx)
2021-12-31 08:48:18 +00:00
fmt.Println(src.Title, src.Error, src.LastStatus)
2021-12-31 09:03:45 +00:00
for i, x := range src.Items {
if i > 5 {
2021-12-31 22:28:06 +00:00
//break
}
auth := ""
if x.IsBoost {
auth = "RT @" + x.Author + ": "
2021-12-31 09:03:45 +00:00
}
2021-12-31 08:48:18 +00:00
d, _ := time.Parse(time.RFC3339, x.PublishedString)
2021-12-31 22:28:06 +00:00
fmt.Println("\t", d.Format(time.Stamp), auth+x.Content)
2021-12-31 08:48:18 +00:00
}
2021-11-15 03:32:24 +00:00
}
// Source is the method set planned for anything feedget can poll (see the
// "TODO: interface Source" note in main).
type Source interface {
	// Update refreshes the source; it takes a context and returns nothing.
	Update(context.Context)

	// Title returns the source's title.
	Title() string
	// Link returns the source's link.
	Link() string
	// Error returns the source's error value.
	Error() error
}
// Cache is currently an empty placeholder type with no fields.
//
// Things we want to keep track of (not implemented yet):
//   - whether the most recent update succeeded
//   - when the last successful update was
//   - how many of the last N updates succeeded
//   - status codes for the last N updates
//   - response time for the last N updates
//   - how frequently items are posted
type Cache struct{}
type FeedSource struct {
Items []*gofeed.Item
Title string
URL string
LastFetch time.Time
Error error
LastStatusCode int
LastStatus string
2021-11-15 03:32:24 +00:00
mu sync.Mutex
}
// NewFeed returns a FeedSource for url. Only the URL field is set; every
// other field keeps its zero value until the source is updated.
func NewFeed(url string) *FeedSource {
	src := new(FeedSource)
	src.URL = url
	return src
}
2021-12-31 09:03:45 +00:00
func (src *FeedSource) update(ctx context.Context) {
2021-11-15 03:32:24 +00:00
src.mu.Lock()
defer src.mu.Unlock()
fp := gofeed.NewParser()
2021-12-31 22:25:30 +00:00
req, err := http.NewRequestWithContext(ctx, "GET", src.URL, nil)
2021-11-15 03:32:24 +00:00
if err != nil {
src.Error = fmt.Errorf("error fetching %q: %w", src.URL, err)
log.Println(src.Error)
return // return err?
}
req.Header.Set("User-Agent", UserAgent)
// TODO: If-Modified-Since, Etag
resp, err := http.DefaultClient.Do(req)
if err != nil {
err := fmt.Errorf("error fetching %q: %w", src.URL, err)
2021-11-15 03:32:24 +00:00
log.Println(err)
src.Error = err
return // return err?
}
if resp != nil && resp.Body != nil {
defer func() {
err := resp.Body.Close()
if err != nil {
log.Printf("error closing response body for %q: %v", src.URL, err)
}
}()
}
src.LastStatusCode = resp.StatusCode
src.LastStatus = resp.Status
if resp.StatusCode != 200 {
src.Error = fmt.Errorf("error fetching %q: status %s", src.URL, resp.Status)
log.Println(src.Error)
return
}
// TODO: cache body
feed, err := fp.Parse(resp.Body)
if err != nil {
err := fmt.Errorf("error parsing %q: %w", src.URL, err)
log.Println(err)
src.Error = err
return // return err?
}
2021-11-15 03:32:24 +00:00
items := feed.Items
sort.Slice(items, func(i, j int) bool {
return items[i].Updated >= items[j].Updated
})
src.Title = feed.Title
src.Items = items
src.LastFetch = time.Now()
src.Error = nil
2021-11-15 03:32:24 +00:00
}