Add JSON Feed support
- Detect JSON Feed format (jsonfeed.org) via version field
- Parse JSON Feed metadata and items
- Support application/feed+json MIME type for feed discovery
- Include "json" as valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
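For context, a minimal JSON Feed document per the jsonfeed.org version 1.1 spec looks like the sketch below; the version field carrying the jsonfeed.org URL is the signal the detection changes in this commit key on (sample values are illustrative):

package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative JSON Feed 1.1 document: version, title, and items are the
// required top-level fields, and version doubles as the format marker.
const sample = `{
	"version": "https://jsonfeed.org/version/1.1",
	"title": "Example Blog",
	"home_page_url": "https://example.com/",
	"items": [{"id": "https://example.com/1", "content_text": "Hello"}]
}`

func main() {
	var probe struct {
		Version string `json:"version"`
	}
	if err := json.Unmarshal([]byte(sample), &probe); err != nil {
		panic(err)
	}
	fmt.Println(probe.Version) // https://jsonfeed.org/version/1.1
}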
@@ -178,12 +178,12 @@ type Feed struct {
 // saveFeed stores a feed in PostgreSQL
 func (c *Crawler) saveFeed(feed *Feed) error {
 	// Default publishStatus to "held" if not set
-	// Auto-deny feeds with no language or non-RSS/Atom type
+	// Auto-deny feeds with no language or unsupported type
 	publishStatus := feed.PublishStatus
 	if publishStatus == "" {
 		if feed.Language == "" {
 			publishStatus = "deny"
-		} else if feed.Type != "rss" && feed.Type != "atom" {
+		} else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
 			publishStatus = "deny"
 		} else {
 			publishStatus = "held"
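The gate above, restated as a standalone function (name hypothetical), makes the policy easier to scan: a missing language always denies, and the type whitelist now has three entries:

// defaultPublishStatus mirrors the saveFeed logic: feeds without a
// detected language are denied outright; otherwise only rss, atom, and
// (new in this commit) json land in the "held" review queue.
func defaultPublishStatus(language, feedType string) string {
	if language == "" {
		return "deny"
	}
	switch feedType {
	case "rss", "atom", "json":
		return "held"
	default:
		return "deny"
	}
}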
@@ -779,6 +779,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
 		items = c.parseRSSMetadata(body, feed)
 	case "atom":
 		items = c.parseAtomMetadata(body, feed)
+	case "json":
+		items = c.parseJSONFeedMetadata(body, feed)
 	}

 	// Refine category based on parsed title (e.g., "Comments on:")
@@ -951,6 +953,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
 		items = c.parseRSSMetadata(body, feed)
 	case "atom":
 		items = c.parseAtomMetadata(body, feed)
+	case "json":
+		items = c.parseJSONFeedMetadata(body, feed)
 	}

 	// Content changed - reset backoff
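Both processFeed and CheckFeed now share the same three-way dispatch, so a JSON Feed flows through the crawler once detection (changed below) labels the body. A hypothetical condensation of that path, plumbing elided:

// feed.Type is assumed to come from detectFeedType: "rss", "atom",
// "json", or "unknown" (which matches no case and yields no items).
var items []*Item
switch feed.Type {
case "rss":
	items = c.parseRSSMetadata(body, feed)
case "atom":
	items = c.parseAtomMetadata(body, feed)
case "json": // new in this commit
	items = c.parseJSONFeedMetadata(body, feed)
}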
@@ -17,7 +17,9 @@ func (c *Crawler) isFeedContent(body, contentType string) bool {
 	if strings.Contains(contentType, "application/rss+xml") ||
 		strings.Contains(contentType, "application/atom+xml") ||
 		strings.Contains(contentType, "application/xml") ||
-		strings.Contains(contentType, "text/xml") {
+		strings.Contains(contentType, "text/xml") ||
+		strings.Contains(contentType, "application/feed+json") ||
+		strings.Contains(contentType, "application/json") {
 		return true
 	}

@@ -27,6 +29,10 @@ func (c *Crawler) isFeedContent(body, contentType string) bool {
 			return true
 		}
 	}
+	// Check for JSON Feed
+	if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
+		return true
+	}
 	return false
 }

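Discovery now accepts JSON responses two ways: by content type (the dedicated application/feed+json, or generic application/json) and, because many servers label feeds as plain JSON, by sniffing the body for the jsonfeed.org marker. The sniff in isolation (function name hypothetical; TrimSpace added here for tolerance, as detectFeedType below does):

package main

import "strings"

// isJSONFeedBody reports whether a body looks like a JSON Feed document:
// a JSON object mentioning the jsonfeed.org version URL.
func isJSONFeedBody(body string) bool {
	body = strings.TrimSpace(body)
	return strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org")
}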
@@ -37,6 +43,11 @@ func (c *Crawler) detectFeedType(body string) string {
 	if strings.Contains(body, "<feed") {
 		return "atom"
 	}
+	// Check for JSON Feed (version field contains jsonfeed.org URL)
+	body = strings.TrimSpace(body)
+	if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
+		return "json"
+	}
 	return "unknown"
 }

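A table-driven test sketch for the new branch (cases hypothetical; a zero-value Crawler is assumed to suffice for detection):

package main

import "testing"

func TestDetectFeedType(t *testing.T) {
	c := &Crawler{}
	cases := map[string]string{
		`<feed xmlns="http://www.w3.org/2005/Atom"></feed>`: "atom",
		`  {"version": "https://jsonfeed.org/version/1.1"}`: "json",    // leading whitespace is trimmed
		`{"version": "1.0"}`:                                "unknown", // JSON, but no jsonfeed.org marker
	}
	for body, want := range cases {
		if got := c.detectFeedType(body); got != want {
			t.Errorf("detectFeedType(%q) = %q, want %q", body, got, want)
		}
	}
}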
@@ -58,11 +69,13 @@ func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
 			}
 		}

-		if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") {
+		if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml" || typeAttr == "application/feed+json") {
 			absURL := makeAbsoluteURL(href, baseURL)
 			feedType := "rss"
 			if typeAttr == "application/atom+xml" {
 				feedType = "atom"
+			} else if typeAttr == "application/feed+json" {
+				feedType = "json"
 			}
 			feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
 		}
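With this hunk, pages can advertise a JSON Feed for autodiscovery the same way they do RSS/Atom, e.g. <link rel="alternate" type="application/feed+json" href="/feed.json"> in the document head. The type-to-label mapping in isolation (hypothetical helper; the "rss" default matches extractFeedLinks, which only reaches it for the three accepted types):

// feedTypeForLink maps an accepted <link type=...> attribute to the
// crawler's internal feed type label.
func feedTypeForLink(typeAttr string) string {
	switch typeAttr {
	case "application/atom+xml":
		return "atom"
	case "application/feed+json":
		return "json"
	default: // application/rss+xml
		return "rss"
	}
}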
@@ -1,6 +1,7 @@
 package main

 import (
+	"encoding/json"
 	"encoding/xml"
 	"fmt"
 	"regexp"
@@ -500,3 +501,139 @@ func extractImgTags(html string) []string {

 	return urls
 }
+
+// JSON Feed structs (https://jsonfeed.org/version/1.1)
+type JSONFeed struct {
+	Version     string         `json:"version"`
+	Title       string         `json:"title"`
+	HomePageURL string         `json:"home_page_url"`
+	FeedURL     string         `json:"feed_url"`
+	Description string         `json:"description"`
+	Language    string         `json:"language"`
+	Items       []JSONFeedItem `json:"items"`
+}
+
+type JSONFeedItem struct {
+	ID            string               `json:"id"`
+	URL           string               `json:"url"`
+	Title         string               `json:"title"`
+	ContentHTML   string               `json:"content_html"`
+	ContentText   string               `json:"content_text"`
+	Summary       string               `json:"summary"`
+	Image         string               `json:"image"`
+	DatePublished string               `json:"date_published"`
+	DateModified  string               `json:"date_modified"`
+	Authors       []JSONFeedAuthor     `json:"authors"`
+	Attachments   []JSONFeedAttachment `json:"attachments"`
+}
+
+type JSONFeedAuthor struct {
+	Name string `json:"name"`
+	URL  string `json:"url"`
+}
+
+type JSONFeedAttachment struct {
+	URL      string `json:"url"`
+	MimeType string `json:"mime_type"`
+	Size     int64  `json:"size_in_bytes"`
+}
+
+func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
+	var jf JSONFeed
+	if err := json.Unmarshal([]byte(body), &jf); err != nil {
+		return nil
+	}
+
+	feed.Title = jf.Title
+	feed.Description = jf.Description
+	feed.Language = jf.Language
+	feed.SiteURL = normalizeURL(jf.HomePageURL)
+	feed.ItemCount = len(jf.Items)
+
+	// Parse items
+	now := time.Now()
+	var items []*Item
+	var dates []time.Time
+
+	for _, ji := range jf.Items {
+		item := &Item{
+			FeedURL:      feed.URL,
+			Title:        ji.Title,
+			Link:         ji.URL,
+			DiscoveredAt: now,
+		}
+
+		// Use ID as GUID, fall back to URL
+		if ji.ID != "" {
+			item.GUID = ji.ID
+		} else if ji.URL != "" {
+			item.GUID = ji.URL
+		}
+
+		// Content: prefer HTML, fall back to text
+		if ji.ContentHTML != "" {
+			item.Content = ji.ContentHTML
+		} else if ji.ContentText != "" {
+			item.Content = ji.ContentText
+		}
+		item.Description = ji.Summary
+
+		// Author
+		if len(ji.Authors) > 0 {
+			item.Author = ji.Authors[0].Name
+		}
+
+		// Parse date
+		dateStr := ji.DatePublished
+		if dateStr == "" {
+			dateStr = ji.DateModified
+		}
+		if dateStr != "" {
+			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
+				item.PubDate = t
+				dates = append(dates, t)
+			}
+		}
+
+		// Images
+		if ji.Image != "" {
+			item.ImageURLs = []string{ji.Image}
+		}
+
+		// Attachments (enclosures)
+		for _, att := range ji.Attachments {
+			if att.URL != "" {
+				item.Enclosure = &Enclosure{
+					URL:    att.URL,
+					Type:   att.MimeType,
+					Length: att.Size,
+				}
+				break // Only use first attachment as enclosure
+			}
+		}
+
+		items = append(items, item)
+	}
+
+	// Calculate date stats
+	if len(dates) > 0 {
+		oldest, newest := dates[0], dates[0]
+		for _, d := range dates {
+			if d.Before(oldest) {
+				oldest = d
+			}
+			if d.After(newest) {
+				newest = d
+			}
+		}
+		feed.OldestItemDate = oldest
+		feed.NewestItemDate = newest
+
+		if len(dates) > 1 {
+			totalHours := newest.Sub(oldest).Hours()
+			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
+		}
+	}
+
+	return items
+}
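A usage sketch of the parser's data path, assuming the JSONFeed structs above are in scope (sample feed hypothetical). JSON Feed timestamps are RFC 3339, which is why time.RFC3339 suffices; AvgPostFreqHrs is the date span divided by the number of gaps (len(dates)-1), so e.g. ten items spread over 90 days average 90*24/9 = 240 hours apart:

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

func main() {
	body := `{
		"version": "https://jsonfeed.org/version/1.1",
		"title": "Example",
		"items": [
			{"id": "a", "date_published": "2024-01-01T00:00:00Z"},
			{"id": "b", "date_published": "2024-01-31T00:00:00Z"}
		]
	}`
	var jf JSONFeed // struct defined in the diff above
	if err := json.Unmarshal([]byte(body), &jf); err != nil {
		panic(err)
	}
	t0, _ := time.Parse(time.RFC3339, jf.Items[0].DatePublished)
	t1, _ := time.Parse(time.RFC3339, jf.Items[1].DatePublished)
	fmt.Printf("%s: one gap of %.0f hours\n", jf.Title, t1.Sub(t0).Hours()) // Example: one gap of 720 hours
}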