2021-09-30 00:23:50 +00:00
|
|
|
// Feedget scrapes RSS feeds (and other sources)
|
|
|
|
// and spits the latest headline from each onto a static web page.
|
|
|
|
package main
|
2021-11-15 03:32:24 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"log"
|
2021-12-31 06:20:56 +00:00
|
|
|
"net/http"
|
2021-11-15 03:32:24 +00:00
|
|
|
"sort"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/mmcdole/gofeed"
|
|
|
|
)
|
|
|
|
|
2021-12-31 06:20:56 +00:00
|
|
|
const UserAgent = "feedget/0.1"
|
|
|
|
|
2021-11-15 03:32:24 +00:00
|
|
|
func main() {
|
2022-01-01 00:52:03 +00:00
|
|
|
var sources = []Source{
|
2021-12-31 06:20:09 +00:00
|
|
|
NewFeed("https://tilde.team/~dozens/dreams/rss.xml"),
|
|
|
|
NewFeed("https://tilde.town/~magical/xkcd.xml"), // "https://xkcd.com/atom.xml",
|
2021-12-31 22:28:06 +00:00
|
|
|
//NewFeed("https://tilde.town/~magical/404.xml"),
|
2022-01-01 00:52:03 +00:00
|
|
|
NewMastoSource("https://tilde.town/~magical/masto_test.html"),
|
2021-11-15 03:32:24 +00:00
|
|
|
}
|
|
|
|
|
2021-12-31 09:03:45 +00:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
|
|
|
defer cancel()
|
|
|
|
|
2021-11-15 03:32:24 +00:00
|
|
|
var wg sync.WaitGroup
|
|
|
|
wg.Add(len(sources))
|
|
|
|
for i := range sources {
|
|
|
|
src := sources[i]
|
|
|
|
go func() {
|
2021-12-31 09:03:45 +00:00
|
|
|
src.update(ctx)
|
2021-11-15 03:32:24 +00:00
|
|
|
wg.Done()
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
|
|
|
|
for _, src := range sources {
|
2022-01-01 00:52:03 +00:00
|
|
|
fmt.Println(src.GetTitle(), src.GetError())
|
|
|
|
for i, x := range src.GetItems() {
|
2021-12-31 09:03:45 +00:00
|
|
|
if i > 5 {
|
2022-01-01 00:52:03 +00:00
|
|
|
//break
|
2021-12-31 09:03:45 +00:00
|
|
|
}
|
2022-01-01 00:52:03 +00:00
|
|
|
fmt.Println("\t", x.Date.Format("2006 Jan _2 15:04"), x.Text)
|
2021-12-31 09:03:45 +00:00
|
|
|
}
|
2021-11-15 03:32:24 +00:00
|
|
|
}
|
2021-12-31 08:48:18 +00:00
|
|
|
|
2021-11-15 03:32:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Source is a refreshable feed of dated items. FeedSource (RSS/Atom
// via gofeed) implements it; main also constructs a MastoSource
// defined elsewhere in the package.
type Source interface {
	// GetTitle returns the source's display title
	// (empty until a successful update).
	GetTitle() string
	//GetLink() string
	// GetError returns the error from the most recent update,
	// or nil if it succeeded.
	GetError() error
	// GetItems returns the source's current items.
	GetItems() []Item
	// update refreshes the source's state, honoring ctx for
	// cancellation/timeout. Unexported: callable only within
	// this package.
	update(context.Context)
}
|
|
|
|
|
|
|
|
// Item is a single entry from a Source: a dated, linked piece of text.
type Item struct {
	Date time.Time // publication (or update) time; zero if the feed gave neither
	Link string // URL of the entry
	Text string // headline/title text
}
|
|
|
|
|
|
|
|
// want to keep track of:
|
|
|
|
// - whether the most recent update succeeded
|
|
|
|
// - when the last successful update was
|
|
|
|
// - how many of the last N updates succeeded
|
|
|
|
// - status codes for the last N updates
|
|
|
|
// - response time for the last N updates
|
|
|
|
// - how frequently items are posted
|
|
|
|
|
|
|
|
// Cache is a placeholder for caching fetched feed data (see the
// "TODO: cache body" note in FeedSource.update). Currently empty
// and unused.
type Cache struct {
}
|
|
|
|
|
|
|
|
// FeedSource fetches an RSS/Atom feed with gofeed and caches the most
// recent result. update replaces the cached fields under mu.
//
// NOTE(review): the Get* methods read these fields without holding mu —
// safe only if callers read after all updates complete (main waits on a
// WaitGroup before reading); confirm if that changes.
type FeedSource struct {
	Items []*gofeed.Item // parsed entries from the last successful fetch, newest first
	Title string // feed title from the last successful fetch
	URL string // feed URL to fetch
	LastFetch time.Time // when the last successful fetch completed
	Error error // error from the most recent update; nil on success

	LastStatusCode int // HTTP status code of the last response (e.g. 200)
	LastStatus string // HTTP status line of the last response (e.g. "200 OK")

	mu sync.Mutex // serializes update's read-modify-write of the fields above
}
|
|
|
|
|
2022-01-01 00:52:03 +00:00
|
|
|
var _ Source = &FeedSource{}
|
|
|
|
|
2021-11-15 03:32:24 +00:00
|
|
|
func NewFeed(url string) *FeedSource {
|
|
|
|
return &FeedSource{
|
|
|
|
URL: url,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-12-31 09:03:45 +00:00
|
|
|
func (src *FeedSource) update(ctx context.Context) {
|
2021-11-15 03:32:24 +00:00
|
|
|
src.mu.Lock()
|
|
|
|
defer src.mu.Unlock()
|
|
|
|
fp := gofeed.NewParser()
|
2021-12-31 06:20:56 +00:00
|
|
|
|
2021-12-31 22:25:30 +00:00
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", src.URL, nil)
|
2021-11-15 03:32:24 +00:00
|
|
|
if err != nil {
|
2021-12-31 06:20:56 +00:00
|
|
|
src.Error = fmt.Errorf("error fetching %q: %w", src.URL, err)
|
|
|
|
log.Println(src.Error)
|
|
|
|
return // return err?
|
|
|
|
}
|
|
|
|
req.Header.Set("User-Agent", UserAgent)
|
|
|
|
// TODO: If-Modified-Since, Etag
|
|
|
|
|
|
|
|
resp, err := http.DefaultClient.Do(req)
|
|
|
|
if err != nil {
|
|
|
|
err := fmt.Errorf("error fetching %q: %w", src.URL, err)
|
2021-11-15 03:32:24 +00:00
|
|
|
log.Println(err)
|
|
|
|
src.Error = err
|
|
|
|
return // return err?
|
|
|
|
}
|
2021-12-31 06:20:56 +00:00
|
|
|
|
|
|
|
if resp != nil && resp.Body != nil {
|
|
|
|
defer func() {
|
|
|
|
err := resp.Body.Close()
|
|
|
|
if err != nil {
|
|
|
|
log.Printf("error closing response body for %q: %v", src.URL, err)
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
src.LastStatusCode = resp.StatusCode
|
|
|
|
src.LastStatus = resp.Status
|
|
|
|
if resp.StatusCode != 200 {
|
|
|
|
src.Error = fmt.Errorf("error fetching %q: status %s", src.URL, resp.Status)
|
|
|
|
log.Println(src.Error)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: cache body
|
|
|
|
|
|
|
|
feed, err := fp.Parse(resp.Body)
|
|
|
|
if err != nil {
|
|
|
|
err := fmt.Errorf("error parsing %q: %w", src.URL, err)
|
|
|
|
log.Println(err)
|
|
|
|
src.Error = err
|
|
|
|
return // return err?
|
|
|
|
}
|
|
|
|
|
2021-11-15 03:32:24 +00:00
|
|
|
items := feed.Items
|
fix FeedSource item sorting
we were sorting by strings instead of parsed dates and it was getting
confused by RFC1123 dates that looked like
Thu, 02 Dec 2021 00:00:00 -0700
Mon, 20 Sep 2021 00:00:00 -0600
Mon, 06 Sep 2021 00:00:00 -0600
Thu, 02 Sep 2021 00:00:00 -0600
Sun, 22 Aug 2021 00:00:00 -0600
which of course sort like
Thu, 02 Sep 2021 00:00:00 -0600
Thu, 02 Dec 2021 00:00:00 -0700
Sun, 22 Aug 2021 00:00:00 -0600
Mon, 20 Sep 2021 00:00:00 -0600
Mon, 06 Sep 2021 00:00:00 -0600
2022-01-01 00:52:31 +00:00
|
|
|
sort.SliceStable(items, func(i, j int) bool {
|
|
|
|
var d1, d2 time.Time
|
|
|
|
if items[i].PublishedParsed != nil {
|
|
|
|
d1 = *items[i].PublishedParsed
|
|
|
|
} else if items[i].UpdatedParsed != nil {
|
|
|
|
d1 = *items[i].UpdatedParsed
|
|
|
|
}
|
|
|
|
|
|
|
|
if items[j].PublishedParsed != nil {
|
|
|
|
d2 = *items[j].PublishedParsed
|
|
|
|
} else if items[j].UpdatedParsed != nil {
|
|
|
|
d2 = *items[j].UpdatedParsed
|
|
|
|
}
|
|
|
|
return !d1.Before(d2)
|
2021-11-15 03:32:24 +00:00
|
|
|
})
|
|
|
|
src.Title = feed.Title
|
|
|
|
src.Items = items
|
|
|
|
src.LastFetch = time.Now()
|
2021-12-31 06:20:56 +00:00
|
|
|
src.Error = nil
|
2021-11-15 03:32:24 +00:00
|
|
|
}
|
2022-01-01 00:52:03 +00:00
|
|
|
|
|
|
|
// GetTitle returns the feed's title (empty until a successful update).
// NOTE(review): reads src.Title without holding src.mu — relies on
// callers reading only after updates finish (main waits on a WaitGroup).
func (src *FeedSource) GetTitle() string { return src.Title }

// GetError returns the error from the most recent update, or nil if it
// succeeded. Same unlocked-read caveat as GetTitle.
func (src *FeedSource) GetError() error { return src.Error }
|
|
|
|
|
|
|
|
func (src *FeedSource) GetItems() (items []Item) {
|
|
|
|
for _, x := range src.Items {
|
|
|
|
d := time.Time{}
|
|
|
|
if x.PublishedParsed != nil {
|
|
|
|
d = *x.PublishedParsed
|
|
|
|
}
|
|
|
|
if x.UpdatedParsed != nil {
|
|
|
|
d = *x.UpdatedParsed
|
|
|
|
}
|
|
|
|
items = append(items, Item{
|
|
|
|
Date: d,
|
|
|
|
Link: x.Link,
|
|
|
|
Text: x.Title,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|