- Detect JSON Feed format (jsonfeed.org) via version field - Parse JSON Feed metadata and items - Support application/feed+json MIME type for feed discovery - Include "json" as valid feed type (not auto-denied) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
640 lines
16 KiB
Go
640 lines
16 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// RSS structs for parsing

// RSS is the root element of an RSS 2.0 document.
type RSS struct {
	// Channel holds all feed-level metadata and the item list.
	Channel RSSChannel `xml:"channel"`
}
|
|
|
|
// RSSChannel is the <channel> element of an RSS feed. Namespaced fields
// use encoding/xml's "namespace-URL localname" tag form.
type RSSChannel struct {
	Title         string `xml:"title"`
	Link          string `xml:"link"` // URL of the site the feed belongs to
	Description   string `xml:"description"`
	Language      string `xml:"language"`
	LastBuildDate string `xml:"lastBuildDate"` // free-form date; parsed with parseRSSDate
	PubDate       string `xml:"pubDate"`
	TTL           int    `xml:"ttl"` // suggested cache lifetime in minutes; 0 when absent
	// Syndication module (sy:) update hints.
	UpdatePeriod string    `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
	UpdateFreq   int       `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	Items        []RSSItem `xml:"item"`
	// iTunes podcast namespace — presence of these marks the feed as a podcast.
	ITunesAuthor   string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
	ITunesOwner    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
	ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
	ITunesType     string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}
|
|
|
|
// RSSItem is a single <item> in an RSS channel. Namespaced fields use
// encoding/xml's "namespace-URL localname" tag form.
type RSSItem struct {
	Title       string        `xml:"title"`
	Link        string        `xml:"link"`
	GUID        string        `xml:"guid"` // unique ID; consumers fall back to Link when empty
	Description string        `xml:"description"`
	Content     string        `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` // content:encoded full body
	Author      string        `xml:"author"`
	Creator     string        `xml:"http://purl.org/dc/elements/1.1/ creator"` // dc:creator, used when <author> is absent
	PubDate     string        `xml:"pubDate"`                                  // free-form date; parsed with parseRSSDate
	Enclosure   *RSSEnclosure `xml:"enclosure"`                                // nil when the item has no attachment
	// iTunes item elements
	ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
	ITunesEpisode  int    `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
	ITunesImage    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
	// Media RSS elements
	MediaContent   []MediaContent   `xml:"http://search.yahoo.com/mrss/ content"`
	MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}
|
|
|
|
// MediaContent represents a media:content element
// (Media RSS namespace, http://search.yahoo.com/mrss/).
type MediaContent struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`   // MIME type, e.g. "image/jpeg"
	Medium string `xml:"medium,attr"` // image, video, audio
	Width  int    `xml:"width,attr"`  // pixels; 0 when unspecified
	Height int    `xml:"height,attr"` // pixels; 0 when unspecified
}
|
|
|
|
// MediaThumbnail represents a media:thumbnail element
// (Media RSS namespace, http://search.yahoo.com/mrss/).
type MediaThumbnail struct {
	URL    string `xml:"url,attr"`
	Width  int    `xml:"width,attr"`  // pixels; 0 when unspecified
	Height int    `xml:"height,attr"` // pixels; 0 when unspecified
}
|
|
|
|
// RSSEnclosure is an <enclosure> attachment on an item (podcast audio,
// video, images, ...).
type RSSEnclosure struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`   // MIME type of the attachment
	Length int64  `xml:"length,attr"` // size in bytes
}
|
|
|
|
// Atom structs for parsing

// AtomFeed is the root <feed> element of an Atom document.
type AtomFeed struct {
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"`    // may mix rel="alternate", "self", etc.
	Updated string      `xml:"updated"` // RFC 3339 timestamp
	Entries []AtomEntry `xml:"entry"`
}
|
|
|
|
// AtomEntry is a single <entry> in an Atom feed.
type AtomEntry struct {
	ID        string      `xml:"id"` // unique identifier; consumers fall back to the entry link
	Title     string      `xml:"title"`
	Links     []AtomLink  `xml:"link"`
	Summary   string      `xml:"summary"`
	Content   AtomContent `xml:"content"`
	Author    AtomAuthor  `xml:"author"`
	Updated   string      `xml:"updated"`   // RFC 3339 timestamp
	Published string      `xml:"published"` // RFC 3339 timestamp; may be empty
}
|
|
|
|
// AtomContent is an entry's <content> element.
type AtomContent struct {
	Type  string `xml:"type,attr"` // payload kind, e.g. "html" or "text"
	Value string `xml:",chardata"` // raw character data of the element
}
|
|
|
|
// AtomAuthor carries an entry author's display name.
type AtomAuthor struct {
	Name string `xml:"name"`
}
|
|
|
|
// AtomLink is an Atom <link> element.
type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`  // empty Rel is treated as "alternate" by consumers here
	Type string `xml:"type,attr"` // MIME type hint, e.g. "text/html"
}
|
|
|
|
// isPodcast checks if an RSS feed is a podcast based on content
|
|
func isPodcast(ch RSSChannel) bool {
|
|
// Check for iTunes namespace elements at channel level
|
|
if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
|
|
ch.ITunesExplicit != "" || ch.ITunesType != "" {
|
|
return true
|
|
}
|
|
|
|
// Check items for audio enclosures or iTunes elements
|
|
audioCount := 0
|
|
for _, item := range ch.Items {
|
|
// Check for iTunes duration or episode number
|
|
if item.ITunesDuration != "" || item.ITunesEpisode > 0 {
|
|
return true
|
|
}
|
|
// Check for audio/video enclosure
|
|
if item.Enclosure != nil && item.Enclosure.URL != "" {
|
|
mimeType := strings.ToLower(item.Enclosure.Type)
|
|
if strings.HasPrefix(mimeType, "audio/") ||
|
|
strings.HasPrefix(mimeType, "video/") ||
|
|
strings.Contains(mimeType, "mpeg") ||
|
|
strings.Contains(mimeType, "mp3") ||
|
|
strings.Contains(mimeType, "mp4") ||
|
|
strings.Contains(mimeType, "m4a") ||
|
|
strings.Contains(mimeType, "ogg") {
|
|
audioCount++
|
|
}
|
|
}
|
|
}
|
|
// If more than half the items have audio enclosures, it's a podcast
|
|
if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 {
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
|
var rss RSS
|
|
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
|
|
return nil
|
|
}
|
|
|
|
ch := rss.Channel
|
|
|
|
feed.Title = ch.Title
|
|
feed.Description = ch.Description
|
|
feed.Language = ch.Language
|
|
feed.SiteURL = normalizeURL(ch.Link)
|
|
feed.TTLMinutes = ch.TTL
|
|
feed.UpdatePeriod = ch.UpdatePeriod
|
|
feed.UpdateFreq = ch.UpdateFreq
|
|
feed.ItemCount = len(ch.Items)
|
|
|
|
// Detect podcast
|
|
if isPodcast(ch) {
|
|
feed.Category = "podcast"
|
|
}
|
|
|
|
// Parse lastBuildDate
|
|
if ch.LastBuildDate != "" {
|
|
if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
|
|
feed.LastBuildDate = t
|
|
}
|
|
}
|
|
|
|
// Parse items
|
|
now := time.Now()
|
|
var items []*Item
|
|
var dates []time.Time
|
|
|
|
for _, rssItem := range ch.Items {
|
|
item := &Item{
|
|
FeedURL: feed.URL,
|
|
Title: rssItem.Title,
|
|
Link: rssItem.Link,
|
|
Description: rssItem.Description,
|
|
Content: rssItem.Content,
|
|
DiscoveredAt: now,
|
|
}
|
|
|
|
// Use GUID if available, otherwise use link
|
|
if rssItem.GUID != "" {
|
|
item.GUID = rssItem.GUID
|
|
} else if rssItem.Link != "" {
|
|
item.GUID = rssItem.Link
|
|
}
|
|
|
|
// Author: prefer author, fall back to dc:creator
|
|
if rssItem.Author != "" {
|
|
item.Author = rssItem.Author
|
|
} else if rssItem.Creator != "" {
|
|
item.Author = rssItem.Creator
|
|
}
|
|
|
|
// Parse pubDate
|
|
if rssItem.PubDate != "" {
|
|
if t, err := parseRSSDate(rssItem.PubDate); err == nil {
|
|
item.PubDate = t
|
|
dates = append(dates, t)
|
|
}
|
|
}
|
|
|
|
// Map enclosure
|
|
if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
|
|
item.Enclosure = &Enclosure{
|
|
URL: rssItem.Enclosure.URL,
|
|
Type: rssItem.Enclosure.Type,
|
|
Length: rssItem.Enclosure.Length,
|
|
}
|
|
}
|
|
|
|
// Extract images from various sources
|
|
item.ImageURLs = extractItemImages(rssItem)
|
|
|
|
items = append(items, item)
|
|
}
|
|
|
|
// Calculate date stats
|
|
if len(dates) > 0 {
|
|
oldest, newest := dates[0], dates[0]
|
|
for _, d := range dates {
|
|
if d.Before(oldest) {
|
|
oldest = d
|
|
}
|
|
if d.After(newest) {
|
|
newest = d
|
|
}
|
|
}
|
|
feed.OldestItemDate = oldest
|
|
feed.NewestItemDate = newest
|
|
|
|
if len(dates) > 1 {
|
|
totalHours := newest.Sub(oldest).Hours()
|
|
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
|
}
|
|
}
|
|
|
|
return items
|
|
}
|
|
|
|
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
|
|
var atom AtomFeed
|
|
if err := xml.Unmarshal([]byte(body), &atom); err != nil {
|
|
return nil
|
|
}
|
|
|
|
feed.Title = atom.Title
|
|
feed.ItemCount = len(atom.Entries)
|
|
|
|
// Get site URL from links
|
|
for _, link := range atom.Link {
|
|
if link.Rel == "" || link.Rel == "alternate" {
|
|
if link.Type == "" || strings.Contains(link.Type, "html") {
|
|
feed.SiteURL = normalizeURL(link.Href)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse updated date
|
|
if atom.Updated != "" {
|
|
if t, err := time.Parse(time.RFC3339, atom.Updated); err == nil {
|
|
feed.LastBuildDate = t
|
|
}
|
|
}
|
|
|
|
// Parse entries
|
|
now := time.Now()
|
|
var items []*Item
|
|
var dates []time.Time
|
|
|
|
for _, entry := range atom.Entries {
|
|
item := &Item{
|
|
FeedURL: feed.URL,
|
|
Title: entry.Title,
|
|
Author: entry.Author.Name,
|
|
DiscoveredAt: now,
|
|
}
|
|
|
|
// Use ID as GUID
|
|
if entry.ID != "" {
|
|
item.GUID = entry.ID
|
|
}
|
|
|
|
// Get link (prefer alternate, fall back to first link)
|
|
for _, link := range entry.Links {
|
|
if link.Rel == "" || link.Rel == "alternate" {
|
|
item.Link = link.Href
|
|
break
|
|
}
|
|
}
|
|
if item.Link == "" && len(entry.Links) > 0 {
|
|
item.Link = entry.Links[0].Href
|
|
}
|
|
|
|
// Use ID as GUID fallback if not set
|
|
if item.GUID == "" && item.Link != "" {
|
|
item.GUID = item.Link
|
|
}
|
|
|
|
// Summary/Content
|
|
item.Description = entry.Summary
|
|
item.Content = entry.Content.Value
|
|
|
|
// Parse dates
|
|
dateStr := entry.Updated
|
|
if dateStr == "" {
|
|
dateStr = entry.Published
|
|
}
|
|
if dateStr != "" {
|
|
if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
|
|
item.PubDate = t
|
|
dates = append(dates, t)
|
|
}
|
|
}
|
|
|
|
items = append(items, item)
|
|
}
|
|
|
|
// Calculate date stats
|
|
if len(dates) > 0 {
|
|
oldest, newest := dates[0], dates[0]
|
|
for _, d := range dates {
|
|
if d.Before(oldest) {
|
|
oldest = d
|
|
}
|
|
if d.After(newest) {
|
|
newest = d
|
|
}
|
|
}
|
|
feed.OldestItemDate = oldest
|
|
feed.NewestItemDate = newest
|
|
|
|
if len(dates) > 1 {
|
|
totalHours := newest.Sub(oldest).Hours()
|
|
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
|
}
|
|
}
|
|
|
|
return items
|
|
}
|
|
|
|
// parseRSSDate attempts to parse various RSS date formats.
// Layouts are tried in order — RFC 1123/822 variants first (the common
// RSS forms), then RFC 3339 and a few lenient fallbacks. Returns the
// zero time and an error when no layout matches.
func parseRSSDate(s string) (time.Time, error) {
	layouts := [...]string{
		time.RFC1123Z,
		time.RFC1123,
		time.RFC822Z,
		time.RFC822,
		time.RFC3339,
		"Mon, 2 Jan 2006 15:04:05 -0700", // single-digit day variant
		"2006-01-02T15:04:05-07:00",
		"2006-01-02 15:04:05",
	}

	for _, layout := range layouts {
		parsed, err := time.Parse(layout, s)
		if err == nil {
			return parsed, nil
		}
	}
	return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
}
|
|
|
|
// calculateNextCrawl determines when to next crawl this feed
|
|
func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
|
|
now := time.Now()
|
|
|
|
// If TTL is specified, use it
|
|
if feed.TTLMinutes > 0 {
|
|
return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
|
|
}
|
|
|
|
// If updatePeriod is specified
|
|
if feed.UpdatePeriod != "" {
|
|
freq := feed.UpdateFreq
|
|
if freq == 0 {
|
|
freq = 1
|
|
}
|
|
switch strings.ToLower(feed.UpdatePeriod) {
|
|
case "hourly":
|
|
return now.Add(time.Duration(freq) * time.Hour)
|
|
case "daily":
|
|
return now.Add(time.Duration(freq) * 24 * time.Hour)
|
|
case "weekly":
|
|
return now.Add(time.Duration(freq) * 7 * 24 * time.Hour)
|
|
case "monthly":
|
|
return now.Add(time.Duration(freq) * 30 * 24 * time.Hour)
|
|
case "yearly":
|
|
return now.Add(time.Duration(freq) * 365 * 24 * time.Hour)
|
|
}
|
|
}
|
|
|
|
// If we have average post frequency, use that
|
|
if feed.AvgPostFreqHrs > 0 {
|
|
// Crawl at half the average frequency, but at least every hour and at most once per day
|
|
crawlInterval := feed.AvgPostFreqHrs / 2
|
|
if crawlInterval < 1 {
|
|
crawlInterval = 1
|
|
}
|
|
if crawlInterval > 24 {
|
|
crawlInterval = 24
|
|
}
|
|
return now.Add(time.Duration(crawlInterval * float64(time.Hour)))
|
|
}
|
|
|
|
// Default: crawl every 6 hours
|
|
return now.Add(6 * time.Hour)
|
|
}
|
|
|
|
// extractItemImages extracts image URLs from an RSS item
|
|
// Sources: media:content, media:thumbnail, iTunes image, and <img> tags in HTML
|
|
func extractItemImages(rssItem RSSItem) []string {
|
|
seen := make(map[string]bool)
|
|
var images []string
|
|
|
|
addImage := func(url string) {
|
|
url = strings.TrimSpace(url)
|
|
if url == "" || seen[url] {
|
|
return
|
|
}
|
|
// Basic validation
|
|
if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
|
|
return
|
|
}
|
|
seen[url] = true
|
|
images = append(images, url)
|
|
}
|
|
|
|
// 1. Media RSS content (prefer larger images)
|
|
for _, mc := range rssItem.MediaContent {
|
|
if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) {
|
|
addImage(mc.URL)
|
|
}
|
|
}
|
|
|
|
// 2. Media RSS thumbnails
|
|
for _, mt := range rssItem.MediaThumbnail {
|
|
if mt.URL != "" {
|
|
addImage(mt.URL)
|
|
}
|
|
}
|
|
|
|
// 3. iTunes image
|
|
if rssItem.ITunesImage != "" {
|
|
addImage(rssItem.ITunesImage)
|
|
}
|
|
|
|
// 4. Image enclosure
|
|
if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") {
|
|
addImage(rssItem.Enclosure.URL)
|
|
}
|
|
|
|
// 5. Extract <img> tags from description and content
|
|
htmlImages := extractImgTags(rssItem.Description)
|
|
htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...)
|
|
for _, img := range htmlImages {
|
|
addImage(img)
|
|
}
|
|
|
|
return images
|
|
}
|
|
|
|
// imgSrcRegex matches the src attribute of <img> tags, single- or
// double-quoted, case-insensitively (HTML tag/attribute names are not
// case-sensitive). Compiled once at package scope so callers don't pay
// a regexp compilation on every invocation.
var imgSrcRegex = regexp.MustCompile(`(?i)<img[^>]+src\s*=\s*["']([^"']+)["']`)

// extractImgTags extracts src URLs from <img> tags in HTML.
// Data URIs and common tracking/spacer image names are skipped; callers
// are expected to validate schemes and de-duplicate the results.
// Returns nil for empty input or when no usable images are found.
func extractImgTags(html string) []string {
	if html == "" {
		return nil
	}

	var urls []string
	for _, match := range imgSrcRegex.FindAllStringSubmatch(html, -1) {
		if len(match) < 2 {
			continue
		}
		url := strings.TrimSpace(match[1])
		// Skip inline data URIs — they are not fetchable URLs.
		if strings.HasPrefix(url, "data:") {
			continue
		}
		// Skip common tracking/spacer images.
		if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
			strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
			continue
		}
		urls = append(urls, url)
	}

	return urls
}
|
|
|
|
// JSON Feed structs (https://jsonfeed.org/version/1.1)

// JSONFeed is the top-level JSON Feed object. Version identifies the
// spec revision and doubles as the format-detection marker.
type JSONFeed struct {
	Version     string         `json:"version"`
	Title       string         `json:"title"`
	HomePageURL string         `json:"home_page_url"` // URL of the site the feed describes
	FeedURL     string         `json:"feed_url"`      // canonical URL of the feed itself
	Description string         `json:"description"`
	Language    string         `json:"language"`
	Items       []JSONFeedItem `json:"items"`
}
|
|
|
|
// JSONFeedItem is one entry in a JSON Feed "items" array.
type JSONFeedItem struct {
	ID            string               `json:"id"` // unique identifier; URL used as fallback downstream
	URL           string               `json:"url"`
	Title         string               `json:"title"`
	ContentHTML   string               `json:"content_html"` // preferred over ContentText by the parser
	ContentText   string               `json:"content_text"`
	Summary       string               `json:"summary"`
	Image         string               `json:"image"`          // main image URL for the item
	DatePublished string               `json:"date_published"` // RFC 3339 timestamp
	DateModified  string               `json:"date_modified"`  // RFC 3339 timestamp
	Authors       []JSONFeedAuthor     `json:"authors"`        // JSON Feed 1.1 author list; first entry is used
	Attachments   []JSONFeedAttachment `json:"attachments"`
}
|
|
|
|
// JSONFeedAuthor identifies a feed or item author.
type JSONFeedAuthor struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}
|
|
|
|
// JSONFeedAttachment is an attached resource (audio, video, ...),
// analogous to an RSS enclosure.
type JSONFeedAttachment struct {
	URL      string `json:"url"`
	MimeType string `json:"mime_type"`
	Size     int64  `json:"size_in_bytes"`
}
|
|
|
|
func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
|
|
var jf JSONFeed
|
|
if err := json.Unmarshal([]byte(body), &jf); err != nil {
|
|
return nil
|
|
}
|
|
|
|
feed.Title = jf.Title
|
|
feed.Description = jf.Description
|
|
feed.Language = jf.Language
|
|
feed.SiteURL = normalizeURL(jf.HomePageURL)
|
|
feed.ItemCount = len(jf.Items)
|
|
|
|
// Parse items
|
|
now := time.Now()
|
|
var items []*Item
|
|
var dates []time.Time
|
|
|
|
for _, ji := range jf.Items {
|
|
item := &Item{
|
|
FeedURL: feed.URL,
|
|
Title: ji.Title,
|
|
Link: ji.URL,
|
|
DiscoveredAt: now,
|
|
}
|
|
|
|
// Use ID as GUID, fall back to URL
|
|
if ji.ID != "" {
|
|
item.GUID = ji.ID
|
|
} else if ji.URL != "" {
|
|
item.GUID = ji.URL
|
|
}
|
|
|
|
// Content: prefer HTML, fall back to text
|
|
if ji.ContentHTML != "" {
|
|
item.Content = ji.ContentHTML
|
|
} else if ji.ContentText != "" {
|
|
item.Content = ji.ContentText
|
|
}
|
|
item.Description = ji.Summary
|
|
|
|
// Author
|
|
if len(ji.Authors) > 0 {
|
|
item.Author = ji.Authors[0].Name
|
|
}
|
|
|
|
// Parse date
|
|
dateStr := ji.DatePublished
|
|
if dateStr == "" {
|
|
dateStr = ji.DateModified
|
|
}
|
|
if dateStr != "" {
|
|
if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
|
|
item.PubDate = t
|
|
dates = append(dates, t)
|
|
}
|
|
}
|
|
|
|
// Images
|
|
if ji.Image != "" {
|
|
item.ImageURLs = []string{ji.Image}
|
|
}
|
|
|
|
// Attachments (enclosures)
|
|
for _, att := range ji.Attachments {
|
|
if att.URL != "" {
|
|
item.Enclosure = &Enclosure{
|
|
URL: att.URL,
|
|
Type: att.MimeType,
|
|
Length: att.Size,
|
|
}
|
|
break // Only use first attachment as enclosure
|
|
}
|
|
}
|
|
|
|
items = append(items, item)
|
|
}
|
|
|
|
// Calculate date stats
|
|
if len(dates) > 0 {
|
|
oldest, newest := dates[0], dates[0]
|
|
for _, d := range dates {
|
|
if d.Before(oldest) {
|
|
oldest = d
|
|
}
|
|
if d.After(newest) {
|
|
newest = d
|
|
}
|
|
}
|
|
feed.OldestItemDate = oldest
|
|
feed.NewestItemDate = newest
|
|
|
|
if len(dates) > 1 {
|
|
totalHours := newest.Sub(oldest).Hours()
|
|
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
|
}
|
|
}
|
|
|
|
return items
|
|
}
|