Add JSON Feed support

- Detect JSON Feed format (jsonfeed.org) via version field
- Parse JSON Feed metadata and items
- Support application/feed+json MIME type for feed discovery
- Include "json" as valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
primal
2026-01-29 13:16:50 -05:00
parent 798f79bfe9
commit ad78c1a4c0
3 changed files with 158 additions and 4 deletions
+6 -2
View File
@@ -178,12 +178,12 @@ type Feed struct {
// saveFeed stores a feed in PostgreSQL // saveFeed stores a feed in PostgreSQL
func (c *Crawler) saveFeed(feed *Feed) error { func (c *Crawler) saveFeed(feed *Feed) error {
// Default publishStatus to "held" if not set // Default publishStatus to "held" if not set
// Auto-deny feeds with no language or non-RSS/Atom type // Auto-deny feeds with no language or unsupported type
publishStatus := feed.PublishStatus publishStatus := feed.PublishStatus
if publishStatus == "" { if publishStatus == "" {
if feed.Language == "" { if feed.Language == "" {
publishStatus = "deny" publishStatus = "deny"
} else if feed.Type != "rss" && feed.Type != "atom" { } else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
publishStatus = "deny" publishStatus = "deny"
} else { } else {
publishStatus = "held" publishStatus = "held"
@@ -779,6 +779,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
items = c.parseRSSMetadata(body, feed) items = c.parseRSSMetadata(body, feed)
case "atom": case "atom":
items = c.parseAtomMetadata(body, feed) items = c.parseAtomMetadata(body, feed)
case "json":
items = c.parseJSONFeedMetadata(body, feed)
} }
// Refine category based on parsed title (e.g., "Comments on:") // Refine category based on parsed title (e.g., "Comments on:")
@@ -951,6 +953,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
items = c.parseRSSMetadata(body, feed) items = c.parseRSSMetadata(body, feed)
case "atom": case "atom":
items = c.parseAtomMetadata(body, feed) items = c.parseAtomMetadata(body, feed)
case "json":
items = c.parseJSONFeedMetadata(body, feed)
} }
// Content changed - reset backoff // Content changed - reset backoff
+15 -2
View File
@@ -17,7 +17,9 @@ func (c *Crawler) isFeedContent(body, contentType string) bool {
if strings.Contains(contentType, "application/rss+xml") || if strings.Contains(contentType, "application/rss+xml") ||
strings.Contains(contentType, "application/atom+xml") || strings.Contains(contentType, "application/atom+xml") ||
strings.Contains(contentType, "application/xml") || strings.Contains(contentType, "application/xml") ||
strings.Contains(contentType, "text/xml") { strings.Contains(contentType, "text/xml") ||
strings.Contains(contentType, "application/feed+json") ||
strings.Contains(contentType, "application/json") {
return true return true
} }
@@ -27,6 +29,10 @@ func (c *Crawler) isFeedContent(body, contentType string) bool {
return true return true
} }
} }
// Check for JSON Feed
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
return true
}
return false return false
} }
@@ -37,6 +43,11 @@ func (c *Crawler) detectFeedType(body string) string {
if strings.Contains(body, "<feed") { if strings.Contains(body, "<feed") {
return "atom" return "atom"
} }
// Check for JSON Feed (version field contains jsonfeed.org URL)
body = strings.TrimSpace(body)
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
return "json"
}
return "unknown" return "unknown"
} }
@@ -58,11 +69,13 @@ func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
} }
} }
if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") { if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml" || typeAttr == "application/feed+json") {
absURL := makeAbsoluteURL(href, baseURL) absURL := makeAbsoluteURL(href, baseURL)
feedType := "rss" feedType := "rss"
if typeAttr == "application/atom+xml" { if typeAttr == "application/atom+xml" {
feedType = "atom" feedType = "atom"
} else if typeAttr == "application/feed+json" {
feedType = "json"
} }
feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType}) feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
} }
+137
View File
@@ -1,6 +1,7 @@
package main package main
import ( import (
"encoding/json"
"encoding/xml" "encoding/xml"
"fmt" "fmt"
"regexp" "regexp"
@@ -500,3 +501,139 @@ func extractImgTags(html string) []string {
return urls return urls
} }
// JSON Feed structs (https://jsonfeed.org/version/1.1)
// JSONFeed models the top-level JSON Feed document. Only the fields the
// crawler consumes are declared; encoding/json ignores unknown keys.
type JSONFeed struct {
Version string `json:"version"` // spec URL, e.g. "https://jsonfeed.org/version/1.1"
Title string `json:"title"`
HomePageURL string `json:"home_page_url"` // URL of the site the feed describes
FeedURL string `json:"feed_url"` // canonical URL of this feed
Description string `json:"description"`
Language string `json:"language"` // optional; only defined in JSON Feed 1.1
Items []JSONFeedItem `json:"items"`
}
// JSONFeedItem is a single entry in a JSON Feed's "items" array.
// NOTE(review): JSON Feed 1.0 used a singular "author" object rather than
// "authors"; that key is not modeled here, so author info from 1.0-only
// feeds is dropped — confirm whether that matters for this crawler.
type JSONFeedItem struct {
ID string `json:"id"` // required by the spec; used as the item GUID downstream
URL string `json:"url"`
Title string `json:"title"`
ContentHTML string `json:"content_html"`
ContentText string `json:"content_text"`
Summary string `json:"summary"`
Image string `json:"image"`
DatePublished string `json:"date_published"` // RFC 3339 per the spec
DateModified string `json:"date_modified"` // RFC 3339 per the spec
Authors []JSONFeedAuthor `json:"authors"` // JSON Feed 1.1 form
Attachments []JSONFeedAttachment `json:"attachments"`
}
// JSONFeedAuthor identifies an item author (an entry in the 1.1 "authors" array).
type JSONFeedAuthor struct {
Name string `json:"name"`
URL string `json:"url"`
}
// JSONFeedAttachment is a resource attached to an item (audio, video, etc.) —
// the JSON Feed analogue of an RSS enclosure.
type JSONFeedAttachment struct {
URL string `json:"url"`
MimeType string `json:"mime_type"`
Size int64 `json:"size_in_bytes"`
}
// parseJSONFeedMetadata parses a JSON Feed (https://jsonfeed.org) body,
// populates feed metadata (title, description, language, site URL, item
// count, and item-date statistics), and returns the parsed items.
//
// Returns nil — and leaves feed untouched — when the body is not valid
// JSON or lacks the spec-required "version" field. Without that guard,
// any stray JSON object (e.g. an API error payload) would clobber the
// feed's title/description/language with empty strings.
func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
	var jf JSONFeed
	if err := json.Unmarshal([]byte(body), &jf); err != nil {
		return nil
	}
	// A real JSON Feed must declare its version; bail out before
	// overwriting feed metadata if this is arbitrary JSON.
	if jf.Version == "" {
		return nil
	}

	feed.Title = jf.Title
	feed.Description = jf.Description
	// "language" is optional (1.1 only). Don't clobber a previously
	// detected language with "" — feeds without a language are
	// auto-denied downstream (see saveFeed).
	if jf.Language != "" {
		feed.Language = jf.Language
	}
	feed.SiteURL = normalizeURL(jf.HomePageURL)
	feed.ItemCount = len(jf.Items)

	// Parse items, collecting publication dates for the frequency stats.
	now := time.Now()
	var items []*Item
	dates := make([]time.Time, 0, len(jf.Items))
	for _, ji := range jf.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        ji.Title,
			Link:         ji.URL,
			DiscoveredAt: now,
		}
		// GUID: the spec-required "id", falling back to the item URL.
		if ji.ID != "" {
			item.GUID = ji.ID
		} else if ji.URL != "" {
			item.GUID = ji.URL
		}
		// Content: prefer HTML, fall back to plain text.
		if ji.ContentHTML != "" {
			item.Content = ji.ContentHTML
		} else if ji.ContentText != "" {
			item.Content = ji.ContentText
		}
		item.Description = ji.Summary
		// JSON Feed 1.1 allows multiple authors; keep only the first.
		if len(ji.Authors) > 0 {
			item.Author = ji.Authors[0].Name
		}
		// Dates are RFC 3339 per the spec; prefer published over modified.
		dateStr := ji.DatePublished
		if dateStr == "" {
			dateStr = ji.DateModified
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		// Images.
		if ji.Image != "" {
			item.ImageURLs = []string{ji.Image}
		}
		// Attachments map onto the single RSS-style enclosure slot:
		// keep only the first attachment that has a URL.
		for _, att := range ji.Attachments {
			if att.URL != "" {
				item.Enclosure = &Enclosure{
					URL:    att.URL,
					Type:   att.MimeType,
					Length: att.Size,
				}
				break
			}
		}
		items = append(items, item)
	}

	// Date statistics: oldest/newest item and average hours between posts.
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			feed.AvgPostFreqHrs = newest.Sub(oldest).Hours() / float64(len(dates)-1)
		}
	}
	return items
}