package main

import (
	"encoding/xml"
	"fmt"
	"strings"
	"time"
)

// RSS structs for parsing.
type RSS struct {
	Channel RSSChannel `xml:"channel"`
}

type RSSChannel struct {
	Title         string    `xml:"title"`
	Link          string    `xml:"link"`
	Description   string    `xml:"description"`
	Language      string    `xml:"language"`
	LastBuildDate string    `xml:"lastBuildDate"`
	PubDate       string    `xml:"pubDate"`
	TTL           int       `xml:"ttl"`
	UpdatePeriod  string    `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
	UpdateFreq    int       `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	Items         []RSSItem `xml:"item"`
}

type RSSItem struct {
	Title       string `xml:"title"`
	Link        string `xml:"link"`
	GUID        string `xml:"guid"`
	Description string `xml:"description"`
	Content     string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
	Author      string `xml:"author"`
	Creator     string `xml:"http://purl.org/dc/elements/1.1/ creator"`
	PubDate     string `xml:"pubDate"`
}

// Atom structs for parsing.
type AtomFeed struct {
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"`
	Updated string      `xml:"updated"`
	Entries []AtomEntry `xml:"entry"`
}

type AtomEntry struct {
	ID        string      `xml:"id"`
	Title     string      `xml:"title"`
	Links     []AtomLink  `xml:"link"`
	Summary   string      `xml:"summary"`
	Content   AtomContent `xml:"content"`
	Author    AtomAuthor  `xml:"author"`
	Updated   string      `xml:"updated"`
	Published string      `xml:"published"`
}

type AtomContent struct {
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}

type AtomAuthor struct {
	Name string `xml:"name"`
}

type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}

// parseRSSMetadata fills feed-level metadata from an RSS 2.0 document and
// returns its items. It returns nil if the body does not parse as XML.
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
	var rss RSS
	if err := xml.Unmarshal([]byte(body), &rss); err != nil {
		return nil
	}

	ch := rss.Channel
	feed.Title = ch.Title
	feed.Description = ch.Description
	feed.Language = ch.Language
	feed.SiteURL = normalizeURL(ch.Link)
	feed.TTLMinutes = ch.TTL
	feed.UpdatePeriod = ch.UpdatePeriod
	feed.UpdateFreq = ch.UpdateFreq
	feed.ItemCount = len(ch.Items)

	// Parse lastBuildDate.
	if ch.LastBuildDate != "" {
		if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
			feed.LastBuildDate = t
		}
	}

	// Parse items.
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, rssItem := range ch.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        rssItem.Title,
			Link:         rssItem.Link,
			Description:  rssItem.Description,
			Content:      rssItem.Content,
			DiscoveredAt: now,
		}

		// Use the GUID if available, otherwise fall back to the link.
		if rssItem.GUID != "" {
			item.GUID = rssItem.GUID
		} else if rssItem.Link != "" {
			item.GUID = rssItem.Link
		}

		// Author: prefer <author>, fall back to <dc:creator>.
		if rssItem.Author != "" {
			item.Author = rssItem.Author
		} else if rssItem.Creator != "" {
			item.Author = rssItem.Creator
		}

		// Parse pubDate.
		if rssItem.PubDate != "" {
			if t, err := parseRSSDate(rssItem.PubDate); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}

		items = append(items, item)
	}

	computeDateStats(feed, dates)
	return items
}
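// computeDateStats records the oldest and newest item dates and the average
// gap between posts in hours. The RSS and Atom parsers both route their
// collected item dates through it rather than repeating the same block.
// (The helper name is introduced here; it does not come from either feed
// format.)
func computeDateStats(feed *Feed, dates []time.Time) {
	if len(dates) == 0 {
		return
	}
	oldest, newest := dates[0], dates[0]
	for _, d := range dates {
		if d.Before(oldest) {
			oldest = d
		}
		if d.After(newest) {
			newest = d
		}
	}
	feed.OldestItemDate = oldest
	feed.NewestItemDate = newest
	if len(dates) > 1 {
		// n dated items span n-1 gaps, so divide the total span by n-1.
		totalHours := newest.Sub(oldest).Hours()
		feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
	}
}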
// parseAtomMetadata fills feed-level metadata from an Atom document and
// returns its entries as items. It returns nil if the body does not parse
// as XML.
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
	var atom AtomFeed
	if err := xml.Unmarshal([]byte(body), &atom); err != nil {
		return nil
	}

	feed.Title = atom.Title
	feed.ItemCount = len(atom.Entries)

	// Get the site URL from the feed-level links, preferring rel="alternate"
	// links that point at HTML.
	for _, link := range atom.Link {
		if link.Rel == "" || link.Rel == "alternate" {
			if link.Type == "" || strings.Contains(link.Type, "html") {
				feed.SiteURL = normalizeURL(link.Href)
				break
			}
		}
	}

	// Parse the feed-level updated date.
	if atom.Updated != "" {
		if t, err := time.Parse(time.RFC3339, atom.Updated); err == nil {
			feed.LastBuildDate = t
		}
	}

	// Parse entries.
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, entry := range atom.Entries {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        entry.Title,
			Author:       entry.Author.Name,
			DiscoveredAt: now,
		}

		// Use the Atom ID as the GUID.
		if entry.ID != "" {
			item.GUID = entry.ID
		}

		// Get the link (prefer rel="alternate", fall back to the first link).
		for _, link := range entry.Links {
			if link.Rel == "" || link.Rel == "alternate" {
				item.Link = link.Href
				break
			}
		}
		if item.Link == "" && len(entry.Links) > 0 {
			item.Link = entry.Links[0].Href
		}

		// Fall back to the link as GUID if the entry had no ID.
		if item.GUID == "" && item.Link != "" {
			item.GUID = item.Link
		}

		// Summary/content.
		item.Description = entry.Summary
		item.Content = entry.Content.Value

		// Parse dates: prefer updated, fall back to published.
		dateStr := entry.Updated
		if dateStr == "" {
			dateStr = entry.Published
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}

		items = append(items, item)
	}

	computeDateStats(feed, dates)
	return items
}

// parseRSSDate attempts to parse the various date formats seen in RSS feeds.
func parseRSSDate(s string) (time.Time, error) {
	formats := []string{
		time.RFC1123Z,
		time.RFC1123,
		time.RFC822Z,
		time.RFC822,
		time.RFC3339,
		"Mon, 2 Jan 2006 15:04:05 -0700", // RFC 1123 with a single-digit day
		"2006-01-02T15:04:05-07:00",
		"2006-01-02 15:04:05",
	}
	for _, format := range formats {
		if t, err := time.Parse(format, s); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
}

// calculateNextCrawl determines when to next crawl this feed.
func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
	now := time.Now()

	// If a <ttl> is specified, use it.
	if feed.TTLMinutes > 0 {
		return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
	}

	// If <sy:updatePeriod> is specified, crawl once per expected update.
	// <sy:updateFrequency> counts updates per period, so the crawl interval
	// is the period divided by the frequency.
	if feed.UpdatePeriod != "" {
		freq := feed.UpdateFreq
		if freq <= 0 {
			freq = 1
		}
		var period time.Duration
		switch strings.ToLower(feed.UpdatePeriod) {
		case "hourly":
			period = time.Hour
		case "daily":
			period = 24 * time.Hour
		case "weekly":
			period = 7 * 24 * time.Hour
		case "monthly":
			period = 30 * 24 * time.Hour
		case "yearly":
			period = 365 * 24 * time.Hour
		}
		if period > 0 {
			return now.Add(period / time.Duration(freq))
		}
	}

	// If we know the average post frequency, crawl at half that interval,
	// clamped to at least every hour and at most once per day.
	if feed.AvgPostFreqHrs > 0 {
		crawlInterval := feed.AvgPostFreqHrs / 2
		if crawlInterval < 1 {
			crawlInterval = 1
		}
		if crawlInterval > 24 {
			crawlInterval = 24
		}
		return now.Add(time.Duration(crawlInterval * float64(time.Hour)))
	}

	// Default: crawl every 6 hours.
	return now.Add(6 * time.Hour)
}
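// exampleScheduling is an illustrative sketch, not called anywhere, showing
// how parseRSSDate and calculateNextCrawl compose. The sample values are
// hypothetical, and Crawler/Feed are assumed to be defined elsewhere in this
// package with the fields used above.
func exampleScheduling(c *Crawler) {
	// The most common RSS date format matches time.RFC1123Z.
	if t, err := parseRSSDate("Mon, 02 Jan 2006 15:04:05 -0700"); err == nil {
		fmt.Println("parsed:", t.UTC())
	}

	// A feed with <ttl>90</ttl> is scheduled 90 minutes out.
	fmt.Println(c.calculateNextCrawl(&Feed{TTLMinutes: 90}))

	// A daily updatePeriod with updateFrequency 2 means two updates per day,
	// so the next crawl lands 12 hours out.
	fmt.Println(c.calculateNextCrawl(&Feed{UpdatePeriod: "daily", UpdateFreq: 2}))

	// With no hints beyond a 10-hour average gap between posts, the crawl
	// interval is half that: 5 hours.
	fmt.Println(c.calculateNextCrawl(&Feed{AvgPostFreqHrs: 10}))
}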