// crawler/parser.go

package main

import (
	"encoding/xml"
	"fmt"
	"strings"
	"time"
)

// RSS structs for parsing.
type RSS struct {
	Channel RSSChannel `xml:"channel"`
}

type RSSChannel struct {
	Title         string    `xml:"title"`
	Link          string    `xml:"link"`
	Description   string    `xml:"description"`
	Language      string    `xml:"language"`
	LastBuildDate string    `xml:"lastBuildDate"`
	PubDate       string    `xml:"pubDate"`
	TTL           int       `xml:"ttl"`
	UpdatePeriod  string    `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
	UpdateFreq    int       `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	Items         []RSSItem `xml:"item"`
}

type RSSItem struct {
	Title       string `xml:"title"`
	Link        string `xml:"link"`
	GUID        string `xml:"guid"`
	Description string `xml:"description"`
	Content     string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
	Author      string `xml:"author"`
	Creator     string `xml:"http://purl.org/dc/elements/1.1/ creator"`
	PubDate     string `xml:"pubDate"`
}
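
// The struct tags with a URL component use encoding/xml's namespace syntax
// ("namespace-URL local-name", separated by a space), so UpdatePeriod above
// matches an element like the following regardless of the prefix the feed
// chooses:
//
//	<rss xmlns:sy="http://purl.org/rss/1.0/modules/syndication/">
//	  ...
//	  <sy:updatePeriod>hourly</sy:updatePeriod>
//
// The same convention binds Content to <content:encoded> and Creator to
// <dc:creator>.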

// Atom structs for parsing.
type AtomFeed struct {
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"`
	Updated string      `xml:"updated"`
	Entries []AtomEntry `xml:"entry"`
}

type AtomEntry struct {
	ID        string      `xml:"id"`
	Title     string      `xml:"title"`
	Links     []AtomLink  `xml:"link"`
	Summary   string      `xml:"summary"`
	Content   AtomContent `xml:"content"`
	Author    AtomAuthor  `xml:"author"`
	Updated   string      `xml:"updated"`
	Published string      `xml:"published"`
}

type AtomContent struct {
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}

type AtomAuthor struct {
	Name string `xml:"name"`
}

type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}
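
// AtomContent separates the type attribute from the element text: ",attr"
// binds a field to an XML attribute and ",chardata" to the element's
// character data, so <content type="html">...</content> populates Type
// and Value respectively.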

// parseRSSMetadata unmarshals an RSS document, fills in the feed's
// channel-level metadata and item-date statistics, and returns the parsed
// items. It returns nil if the body is not valid XML.
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
	var rss RSS
	if err := xml.Unmarshal([]byte(body), &rss); err != nil {
		return nil
	}
	ch := rss.Channel
	feed.Title = ch.Title
	feed.Description = ch.Description
	feed.Language = ch.Language
	feed.SiteURL = normalizeURL(ch.Link)
	feed.TTLMinutes = ch.TTL
	feed.UpdatePeriod = ch.UpdatePeriod
	feed.UpdateFreq = ch.UpdateFreq
	feed.ItemCount = len(ch.Items)

	// Parse lastBuildDate.
	if ch.LastBuildDate != "" {
		if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
			feed.LastBuildDate = t
		}
	}

	// Parse items.
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, rssItem := range ch.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        rssItem.Title,
			Link:         rssItem.Link,
			Description:  rssItem.Description,
			Content:      rssItem.Content,
			DiscoveredAt: now,
		}
		// Use the GUID if available, otherwise fall back to the link.
		if rssItem.GUID != "" {
			item.GUID = rssItem.GUID
		} else if rssItem.Link != "" {
			item.GUID = rssItem.Link
		}
		// Author: prefer <author>, fall back to <dc:creator>.
		if rssItem.Author != "" {
			item.Author = rssItem.Author
		} else if rssItem.Creator != "" {
			item.Author = rssItem.Creator
		}
		// Parse pubDate.
		if rssItem.PubDate != "" {
			if t, err := parseRSSDate(rssItem.PubDate); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		items = append(items, item)
	}

	// Calculate date stats.
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}
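
// A minimal sketch of how the two parsers are typically driven. The fetch
// step and the Feed/Item definitions live elsewhere in this package, and
// the format sniffing shown here is an assumption for illustration, not
// this file's code:
//
//	feed := &Feed{URL: feedURL}
//	var items []*Item
//	if strings.Contains(body, "<rss") {
//		items = c.parseRSSMetadata(body, feed)
//	} else {
//		items = c.parseAtomMetadata(body, feed)
//	}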

// parseAtomMetadata unmarshals an Atom document, fills in the feed's
// metadata and item-date statistics, and returns the parsed entries as
// items. It returns nil if the body is not valid XML.
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
	var atom AtomFeed
	if err := xml.Unmarshal([]byte(body), &atom); err != nil {
		return nil
	}
	feed.Title = atom.Title
	feed.ItemCount = len(atom.Entries)

	// Get the site URL from the feed-level links: prefer an alternate
	// (or untyped) link that points at an HTML page.
	for _, link := range atom.Link {
		if link.Rel == "" || link.Rel == "alternate" {
			if link.Type == "" || strings.Contains(link.Type, "html") {
				feed.SiteURL = normalizeURL(link.Href)
				break
			}
		}
	}

	// Parse the feed-level updated date (Atom dates are RFC 3339).
	if atom.Updated != "" {
		if t, err := time.Parse(time.RFC3339, atom.Updated); err == nil {
			feed.LastBuildDate = t
		}
	}

	// Parse entries.
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, entry := range atom.Entries {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        entry.Title,
			Author:       entry.Author.Name,
			DiscoveredAt: now,
		}
		// Use the entry ID as the GUID.
		if entry.ID != "" {
			item.GUID = entry.ID
		}
		// Get the link (prefer alternate, fall back to the first link).
		for _, link := range entry.Links {
			if link.Rel == "" || link.Rel == "alternate" {
				item.Link = link.Href
				break
			}
		}
		if item.Link == "" && len(entry.Links) > 0 {
			item.Link = entry.Links[0].Href
		}
		// Fall back to the link as the GUID if the entry had no ID.
		if item.GUID == "" && item.Link != "" {
			item.GUID = item.Link
		}
		// Summary/content.
		item.Description = entry.Summary
		item.Content = entry.Content.Value
		// Parse dates: prefer updated, fall back to published.
		dateStr := entry.Updated
		if dateStr == "" {
			dateStr = entry.Published
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		items = append(items, item)
	}

	// Calculate date stats (same logic as the RSS path).
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}

// parseRSSDate attempts to parse the various date formats found in RSS
// feeds: the RFC 1123/822 variants the spec allows, plus RFC 3339 and a
// few non-standard forms seen in the wild.
func parseRSSDate(s string) (time.Time, error) {
	formats := []string{
		time.RFC1123Z,
		time.RFC1123,
		time.RFC822Z,
		time.RFC822,
		time.RFC3339,
		"Mon, 2 Jan 2006 15:04:05 -0700", // non-zero-padded day
		"2006-01-02T15:04:05-07:00",
		"2006-01-02 15:04:05",
	}
	for _, format := range formats {
		if t, err := time.Parse(format, s); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
}
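
// For example, both of these parse successfully:
//
//	t, _ := parseRSSDate("Mon, 2 Jan 2006 15:04:05 -0700") // non-padded day
//	t, _ = parseRSSDate("2006-01-02T15:04:05Z")            // RFC 3339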

// calculateNextCrawl determines when to next crawl this feed, preferring
// the feed's own scheduling hints (<ttl>, then the syndication module's
// updatePeriod/updateFrequency) and falling back to the observed posting
// rate.
func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
	now := time.Now()

	// If a TTL is specified, use it.
	if feed.TTLMinutes > 0 {
		return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
	}

	// If an updatePeriod is specified, divide the period by the update
	// frequency: per the syndication module, updateFrequency is the number
	// of updates within the period (daily with a frequency of 2 means
	// every 12 hours). It defaults to 1.
	if feed.UpdatePeriod != "" {
		freq := feed.UpdateFreq
		if freq <= 0 {
			freq = 1
		}
		var period time.Duration
		switch strings.ToLower(feed.UpdatePeriod) {
		case "hourly":
			period = time.Hour
		case "daily":
			period = 24 * time.Hour
		case "weekly":
			period = 7 * 24 * time.Hour
		case "monthly":
			period = 30 * 24 * time.Hour
		case "yearly":
			period = 365 * 24 * time.Hour
		}
		if period > 0 {
			return now.Add(period / time.Duration(freq))
		}
	}

	// If we have an average post frequency, crawl at half the average
	// interval, clamped to between one hour and one day.
	if feed.AvgPostFreqHrs > 0 {
		crawlInterval := feed.AvgPostFreqHrs / 2
		if crawlInterval < 1 {
			crawlInterval = 1
		}
		if crawlInterval > 24 {
			crawlInterval = 24
		}
		return now.Add(time.Duration(crawlInterval * float64(time.Hour)))
	}

	// Default: crawl every 6 hours.
	return now.Add(6 * time.Hour)
}
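
// A sketch of where calculateNextCrawl fits in the crawl loop. The store
// and the NextCrawlAt field live outside this file, so the names here are
// assumptions for illustration:
//
//	items := c.parseRSSMetadata(body, feed)
//	feed.NextCrawlAt = c.calculateNextCrawl(feed)
//	c.store.SaveFeed(feed)
//	c.store.SaveItems(items)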