Files
crawler/parser.go
primal ad78c1a4c0 Add JSON Feed support
- Detect JSON Feed format (jsonfeed.org) via version field
- Parse JSON Feed metadata and items
- Support application/feed+json MIME type for feed discovery
- Include "json" as valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 13:16:50 -05:00

640 lines
16 KiB
Go

package main
import (
"encoding/json"
"encoding/xml"
"fmt"
"regexp"
"strings"
"time"
)
// RSS structs for parsing

// RSS is the root element of an RSS 2.0 document; all feed data lives
// under the single <channel> child.
type RSS struct {
	Channel RSSChannel `xml:"channel"`
}
// RSSChannel holds channel-level feed metadata plus the item list.
// Namespaced elements use Go's `xml:"<namespace-URL> <local>"` tag form:
// the syndication module (sy:) update hints and the iTunes podcast
// namespace used for podcast detection.
type RSSChannel struct {
	Title         string `xml:"title"`
	Link          string `xml:"link"`
	Description   string `xml:"description"`
	Language      string `xml:"language"`
	LastBuildDate string `xml:"lastBuildDate"`
	PubDate       string `xml:"pubDate"`
	TTL           int    `xml:"ttl"` // suggested cache lifetime in minutes
	// sy:updatePeriod ("hourly"/"daily"/...) and sy:updateFrequency,
	// consumed by calculateNextCrawl.
	UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
	UpdateFreq   int    `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	Items        []RSSItem `xml:"item"`
	// iTunes podcast namespace
	ITunesAuthor   string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
	ITunesOwner    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
	ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
	ITunesType     string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}
// RSSItem is a single <item> in an RSS channel. Optional extension
// elements (content:encoded, dc:creator, iTunes, Media RSS) are mapped
// via their namespace URLs.
type RSSItem struct {
	Title       string        `xml:"title"`
	Link        string        `xml:"link"`
	GUID        string        `xml:"guid"`
	Description string        `xml:"description"`
	Content     string        `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` // content:encoded HTML body
	Author      string        `xml:"author"`
	Creator     string        `xml:"http://purl.org/dc/elements/1.1/ creator"` // dc:creator, author fallback
	PubDate     string        `xml:"pubDate"`
	Enclosure   *RSSEnclosure `xml:"enclosure"` // nil when the item has no enclosure
	// iTunes item elements
	ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
	ITunesEpisode  int    `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
	ITunesImage    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
	// Media RSS elements
	MediaContent   []MediaContent   `xml:"http://search.yahoo.com/mrss/ content"`
	MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}
// MediaContent represents a media:content element.
// Either Medium or a MIME Type may identify the content as an image
// (extractItemImages accepts either signal).
type MediaContent struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`
	Medium string `xml:"medium,attr"` // image, video, audio
	Width  int    `xml:"width,attr"`
	Height int    `xml:"height,attr"`
}
// MediaThumbnail represents a media:thumbnail element.
type MediaThumbnail struct {
	URL    string `xml:"url,attr"`
	Width  int    `xml:"width,attr"`
	Height int    `xml:"height,attr"`
}
// RSSEnclosure is an <enclosure> attachment (typically podcast audio or
// an image); Length is the declared size in bytes.
type RSSEnclosure struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"` // MIME type, e.g. "audio/mpeg"
	Length int64  `xml:"length,attr"`
}
// Atom structs for parsing

// AtomFeed is the root <feed> element of an Atom document.
type AtomFeed struct {
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"` // feed-level links; rel="alternate" points at the site
	Updated string      `xml:"updated"`
	Entries []AtomEntry `xml:"entry"`
}

// AtomEntry is a single <entry>. Updated/Published are RFC 3339 strings
// (parsed later by parseAtomMetadata).
type AtomEntry struct {
	ID        string      `xml:"id"`
	Title     string      `xml:"title"`
	Links     []AtomLink  `xml:"link"`
	Summary   string      `xml:"summary"`
	Content   AtomContent `xml:"content"`
	Author    AtomAuthor  `xml:"author"`
	Updated   string      `xml:"updated"`
	Published string      `xml:"published"`
}

// AtomContent is an entry's <content>, with its type attribute
// (e.g. "html", "text") and the raw character data.
type AtomContent struct {
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}

// AtomAuthor carries the entry author's display name.
type AtomAuthor struct {
	Name string `xml:"name"`
}

// AtomLink is a <link> element; Rel distinguishes "alternate",
// "self", etc. An empty Rel is treated as "alternate" per the spec.
type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}
// isPodcast reports whether an RSS channel looks like a podcast.
// Any iTunes namespace usage (channel- or item-level) is decisive;
// otherwise the channel qualifies when more than half of its items
// carry audio/video enclosures.
func isPodcast(ch RSSChannel) bool {
	// Channel-level iTunes elements are a strong podcast signal.
	if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
		ch.ITunesExplicit != "" || ch.ITunesType != "" {
		return true
	}
	mediaEnclosures := 0
	for _, it := range ch.Items {
		// Per-item iTunes metadata is also decisive on its own.
		if it.ITunesDuration != "" || it.ITunesEpisode > 0 {
			return true
		}
		if it.Enclosure == nil || it.Enclosure.URL == "" {
			continue
		}
		mt := strings.ToLower(it.Enclosure.Type)
		switch {
		case strings.HasPrefix(mt, "audio/"),
			strings.HasPrefix(mt, "video/"),
			strings.Contains(mt, "mpeg"),
			strings.Contains(mt, "mp3"),
			strings.Contains(mt, "mp4"),
			strings.Contains(mt, "m4a"),
			strings.Contains(mt, "ogg"):
			mediaEnclosures++
		}
	}
	// Majority of items carrying media enclosures => podcast.
	return len(ch.Items) > 0 && mediaEnclosures > len(ch.Items)/2
}
// parseRSSMetadata parses an RSS 2.0 document, fills feed-level metadata on
// feed (title, site URL, TTL, syndication hints, podcast category, item
// date statistics) and returns the channel's items mapped to *Item.
// Returns nil when the body is not valid XML.
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
	var rss RSS
	if err := xml.Unmarshal([]byte(body), &rss); err != nil {
		// Unparseable body: leave feed untouched.
		return nil
	}
	ch := rss.Channel
	feed.Title = ch.Title
	feed.Description = ch.Description
	feed.Language = ch.Language
	feed.SiteURL = normalizeURL(ch.Link)
	feed.TTLMinutes = ch.TTL
	feed.UpdatePeriod = ch.UpdatePeriod
	feed.UpdateFreq = ch.UpdateFreq
	feed.ItemCount = len(ch.Items)
	// Detect podcast
	if isPodcast(ch) {
		feed.Category = "podcast"
	}
	// Parse lastBuildDate (best effort; unparseable dates are ignored)
	if ch.LastBuildDate != "" {
		if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
			feed.LastBuildDate = t
		}
	}
	// Parse items
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, rssItem := range ch.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        rssItem.Title,
			Link:         rssItem.Link,
			Description:  rssItem.Description,
			Content:      rssItem.Content,
			DiscoveredAt: now,
		}
		// Use GUID if available, otherwise use link
		if rssItem.GUID != "" {
			item.GUID = rssItem.GUID
		} else if rssItem.Link != "" {
			item.GUID = rssItem.Link
		}
		// Author: prefer author, fall back to dc:creator
		if rssItem.Author != "" {
			item.Author = rssItem.Author
		} else if rssItem.Creator != "" {
			item.Author = rssItem.Creator
		}
		// Parse pubDate; only successfully parsed dates feed the stats below
		if rssItem.PubDate != "" {
			if t, err := parseRSSDate(rssItem.PubDate); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		// Map enclosure
		if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
			item.Enclosure = &Enclosure{
				URL:    rssItem.Enclosure.URL,
				Type:   rssItem.Enclosure.Type,
				Length: rssItem.Enclosure.Length,
			}
		}
		// Extract images from various sources
		item.ImageURLs = extractItemImages(rssItem)
		items = append(items, item)
	}
	// Calculate date stats: oldest/newest item dates and the average
	// gap (in hours) between consecutive posts, assuming items are
	// evenly spread across [oldest, newest].
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}
// parseAtomMetadata parses an Atom document, fills feed-level metadata
// (title, site URL from the alternate link, updated date, item date
// statistics) and returns the entries mapped to *Item.
// Returns nil when the body is not valid XML.
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
	var atom AtomFeed
	if err := xml.Unmarshal([]byte(body), &atom); err != nil {
		// Unparseable body: leave feed untouched.
		return nil
	}
	feed.Title = atom.Title
	feed.ItemCount = len(atom.Entries)
	// Get site URL from links: first link with rel="alternate" (or rel
	// omitted, which defaults to alternate) that is HTML or untyped.
	for _, link := range atom.Link {
		if link.Rel == "" || link.Rel == "alternate" {
			if link.Type == "" || strings.Contains(link.Type, "html") {
				feed.SiteURL = normalizeURL(link.Href)
				break
			}
		}
	}
	// Parse updated date (Atom dates are RFC 3339 per the spec,
	// so no multi-format fallback is needed here)
	if atom.Updated != "" {
		if t, err := time.Parse(time.RFC3339, atom.Updated); err == nil {
			feed.LastBuildDate = t
		}
	}
	// Parse entries
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, entry := range atom.Entries {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        entry.Title,
			Author:       entry.Author.Name,
			DiscoveredAt: now,
		}
		// Use ID as GUID
		if entry.ID != "" {
			item.GUID = entry.ID
		}
		// Get link (prefer alternate, fall back to first link)
		for _, link := range entry.Links {
			if link.Rel == "" || link.Rel == "alternate" {
				item.Link = link.Href
				break
			}
		}
		if item.Link == "" && len(entry.Links) > 0 {
			item.Link = entry.Links[0].Href
		}
		// Use link as GUID fallback if ID was empty
		if item.GUID == "" && item.Link != "" {
			item.GUID = item.Link
		}
		// Summary/Content
		item.Description = entry.Summary
		item.Content = entry.Content.Value
		// Parse dates: prefer <updated>, fall back to <published>
		dateStr := entry.Updated
		if dateStr == "" {
			dateStr = entry.Published
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		items = append(items, item)
	}
	// Calculate date stats: oldest/newest entry dates and average hours
	// between posts (same computation as the RSS and JSON Feed parsers).
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}
// parseRSSDate attempts to parse various RSS date formats
func parseRSSDate(s string) (time.Time, error) {
formats := []string{
time.RFC1123Z,
time.RFC1123,
time.RFC822Z,
time.RFC822,
time.RFC3339,
"Mon, 2 Jan 2006 15:04:05 -0700",
"2006-01-02T15:04:05-07:00",
"2006-01-02 15:04:05",
}
for _, format := range formats {
if t, err := time.Parse(format, s); err == nil {
return t, nil
}
}
return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
}
// calculateNextCrawl determines when to next crawl this feed, trying in
// priority order: the feed's advertised TTL, its sy:updatePeriod /
// sy:updateFrequency hints, the observed average posting interval
// (clamped to [1h, 24h] and halved), and finally a fixed 6-hour default.
func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
	now := time.Now()

	// 1. An explicit TTL (minutes) wins outright.
	if feed.TTLMinutes > 0 {
		return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
	}

	// 2. Syndication-module hints.
	if feed.UpdatePeriod != "" {
		freq := feed.UpdateFreq
		if freq == 0 {
			freq = 1 // frequency defaults to 1 when omitted
		}
		var unit time.Duration
		switch strings.ToLower(feed.UpdatePeriod) {
		case "hourly":
			unit = time.Hour
		case "daily":
			unit = 24 * time.Hour
		case "weekly":
			unit = 7 * 24 * time.Hour
		case "monthly":
			unit = 30 * 24 * time.Hour
		case "yearly":
			unit = 365 * 24 * time.Hour
		}
		// Unrecognized periods fall through to the heuristics below.
		if unit > 0 {
			return now.Add(time.Duration(freq) * unit)
		}
	}

	// 3. Observed cadence: crawl at half the average gap between posts,
	// but at least hourly and at most daily.
	if feed.AvgPostFreqHrs > 0 {
		hrs := feed.AvgPostFreqHrs / 2
		switch {
		case hrs < 1:
			hrs = 1
		case hrs > 24:
			hrs = 24
		}
		return now.Add(time.Duration(hrs * float64(time.Hour)))
	}

	// 4. Default: crawl every 6 hours.
	return now.Add(6 * time.Hour)
}
// extractItemImages collects candidate image URLs for an RSS item,
// de-duplicated and in priority order: media:content, media:thumbnail,
// itunes:image, image-typed enclosures, then <img> tags scraped from the
// description/content HTML. Only absolute http(s) URLs are kept.
func extractItemImages(rssItem RSSItem) []string {
	var out []string
	dedup := map[string]bool{}

	// add appends u unless it is empty, already seen, or not an
	// absolute http(s) URL.
	add := func(u string) {
		u = strings.TrimSpace(u)
		if u == "" || dedup[u] {
			return
		}
		if !strings.HasPrefix(u, "http://") && !strings.HasPrefix(u, "https://") {
			return
		}
		dedup[u] = true
		out = append(out, u)
	}

	// 1. media:content entries identified as images.
	for _, mc := range rssItem.MediaContent {
		if mc.URL == "" {
			continue
		}
		if mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/") {
			add(mc.URL)
		}
	}
	// 2. media:thumbnail entries (empty URLs are rejected inside add).
	for _, mt := range rssItem.MediaThumbnail {
		add(mt.URL)
	}
	// 3. itunes:image href.
	add(rssItem.ITunesImage)
	// 4. The enclosure, when it carries an image MIME type.
	if enc := rssItem.Enclosure; enc != nil && strings.HasPrefix(enc.Type, "image/") {
		add(enc.URL)
	}
	// 5. <img> tags embedded in the HTML description and content.
	for _, src := range extractImgTags(rssItem.Description) {
		add(src)
	}
	for _, src := range extractImgTags(rssItem.Content) {
		add(src)
	}
	return out
}
// imgSrcRegex matches the src attribute of <img> tags. HTML tag and
// attribute names are case-insensitive, hence (?i). Compiled once at
// package scope instead of on every call.
var imgSrcRegex = regexp.MustCompile(`(?i)<img[^>]+src\s*=\s*["']([^"']+)["']`)

// extractImgTags extracts src URLs from <img> tags in HTML.
// It skips data: URIs and URLs that look like tracking pixels or spacer
// images. Returns nil when the input is empty or yields no usable URLs.
func extractImgTags(html string) []string {
	if html == "" {
		return nil
	}
	var urls []string
	for _, match := range imgSrcRegex.FindAllStringSubmatch(html, -1) {
		if len(match) < 2 {
			continue
		}
		url := strings.TrimSpace(match[1])
		// Skip inline data URIs — not fetchable URLs.
		if strings.HasPrefix(url, "data:") {
			continue
		}
		// Skip common tracking/spacer images (heuristic substring match).
		if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
			strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
			continue
		}
		urls = append(urls, url)
	}
	return urls
}
// JSON Feed structs (https://jsonfeed.org/version/1.1)

// JSONFeed is the top-level JSON Feed object. Version carries the
// jsonfeed.org version URL used elsewhere for format detection.
type JSONFeed struct {
	Version     string         `json:"version"`
	Title       string         `json:"title"`
	HomePageURL string         `json:"home_page_url"`
	FeedURL     string         `json:"feed_url"`
	Description string         `json:"description"`
	Language    string         `json:"language"`
	Items       []JSONFeedItem `json:"items"`
}

// JSONFeedItem is a single entry; the spec allows content_html and/or
// content_text, and RFC 3339 date strings.
type JSONFeedItem struct {
	ID            string               `json:"id"`
	URL           string               `json:"url"`
	Title         string               `json:"title"`
	ContentHTML   string               `json:"content_html"`
	ContentText   string               `json:"content_text"`
	Summary       string               `json:"summary"`
	Image         string               `json:"image"`
	DatePublished string               `json:"date_published"`
	DateModified  string               `json:"date_modified"`
	Authors       []JSONFeedAuthor     `json:"authors"`
	Attachments   []JSONFeedAttachment `json:"attachments"`
}

// JSONFeedAuthor is an entry in an item's authors array.
type JSONFeedAuthor struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

// JSONFeedAttachment is an attached resource (e.g. podcast audio),
// analogous to an RSS enclosure.
type JSONFeedAttachment struct {
	URL      string `json:"url"`
	MimeType string `json:"mime_type"`
	Size     int64  `json:"size_in_bytes"`
}
// parseJSONFeedMetadata parses a JSON Feed document, fills feed-level
// metadata (title, description, language, site URL, item date statistics)
// and returns the items mapped to *Item.
// Returns nil when the body is not valid JSON.
func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
	var jf JSONFeed
	if err := json.Unmarshal([]byte(body), &jf); err != nil {
		// Unparseable body: leave feed untouched.
		return nil
	}
	feed.Title = jf.Title
	feed.Description = jf.Description
	feed.Language = jf.Language
	feed.SiteURL = normalizeURL(jf.HomePageURL)
	feed.ItemCount = len(jf.Items)
	// Parse items
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, ji := range jf.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        ji.Title,
			Link:         ji.URL,
			DiscoveredAt: now,
		}
		// Use ID as GUID, fall back to URL
		if ji.ID != "" {
			item.GUID = ji.ID
		} else if ji.URL != "" {
			item.GUID = ji.URL
		}
		// Content: prefer HTML, fall back to text
		if ji.ContentHTML != "" {
			item.Content = ji.ContentHTML
		} else if ji.ContentText != "" {
			item.Content = ji.ContentText
		}
		item.Description = ji.Summary
		// Author: first entry of the v1.1 authors array
		if len(ji.Authors) > 0 {
			item.Author = ji.Authors[0].Name
		}
		// Parse date: prefer date_published, fall back to date_modified
		// (JSON Feed dates are RFC 3339 per the spec)
		dateStr := ji.DatePublished
		if dateStr == "" {
			dateStr = ji.DateModified
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		// Images
		if ji.Image != "" {
			item.ImageURLs = []string{ji.Image}
		}
		// Attachments (enclosures)
		for _, att := range ji.Attachments {
			if att.URL != "" {
				item.Enclosure = &Enclosure{
					URL:    att.URL,
					Type:   att.MimeType,
					Length: att.Size,
				}
				break // Only use first attachment as enclosure
			}
		}
		items = append(items, item)
	}
	// Calculate date stats: oldest/newest item dates and average hours
	// between posts (same computation as the RSS and Atom parsers).
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}