Add JSON Feed support
- Detect JSON Feed format (jsonfeed.org) via version field
- Parse JSON Feed metadata and items
- Support application/feed+json MIME type for feed discovery
- Include "json" as valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -178,12 +178,12 @@ type Feed struct {
|
||||
// saveFeed stores a feed in PostgreSQL
|
||||
func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
// Default publishStatus to "held" if not set
|
||||
// Auto-deny feeds with no language or non-RSS/Atom type
|
||||
// Auto-deny feeds with no language or unsupported type
|
||||
publishStatus := feed.PublishStatus
|
||||
if publishStatus == "" {
|
||||
if feed.Language == "" {
|
||||
publishStatus = "deny"
|
||||
} else if feed.Type != "rss" && feed.Type != "atom" {
|
||||
} else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
|
||||
publishStatus = "deny"
|
||||
} else {
|
||||
publishStatus = "held"
|
||||
@@ -779,6 +779,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
||||
items = c.parseRSSMetadata(body, feed)
|
||||
case "atom":
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
case "json":
|
||||
items = c.parseJSONFeedMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Refine category based on parsed title (e.g., "Comments on:")
|
||||
@@ -951,6 +953,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
items = c.parseRSSMetadata(body, feed)
|
||||
case "atom":
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
case "json":
|
||||
items = c.parseJSONFeedMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Content changed - reset backoff
|
||||
|
||||
@@ -17,7 +17,9 @@ func (c *Crawler) isFeedContent(body, contentType string) bool {
|
||||
if strings.Contains(contentType, "application/rss+xml") ||
|
||||
strings.Contains(contentType, "application/atom+xml") ||
|
||||
strings.Contains(contentType, "application/xml") ||
|
||||
strings.Contains(contentType, "text/xml") {
|
||||
strings.Contains(contentType, "text/xml") ||
|
||||
strings.Contains(contentType, "application/feed+json") ||
|
||||
strings.Contains(contentType, "application/json") {
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -27,6 +29,10 @@ func (c *Crawler) isFeedContent(body, contentType string) bool {
|
||||
return true
|
||||
}
|
||||
}
|
||||
// Check for JSON Feed
|
||||
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -37,6 +43,11 @@ func (c *Crawler) detectFeedType(body string) string {
|
||||
if strings.Contains(body, "<feed") {
|
||||
return "atom"
|
||||
}
|
||||
// Check for JSON Feed (version field contains jsonfeed.org URL)
|
||||
body = strings.TrimSpace(body)
|
||||
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
|
||||
return "json"
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
@@ -58,11 +69,13 @@ func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
|
||||
}
|
||||
}
|
||||
|
||||
if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") {
|
||||
if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml" || typeAttr == "application/feed+json") {
|
||||
absURL := makeAbsoluteURL(href, baseURL)
|
||||
feedType := "rss"
|
||||
if typeAttr == "application/atom+xml" {
|
||||
feedType = "atom"
|
||||
} else if typeAttr == "application/feed+json" {
|
||||
feedType = "json"
|
||||
}
|
||||
feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"regexp"
|
||||
@@ -500,3 +501,139 @@ func extractImgTags(html string) []string {
|
||||
|
||||
return urls
|
||||
}
|
||||
|
||||
// JSON Feed structs (https://jsonfeed.org/version/1.1)
|
||||
type JSONFeed struct {
|
||||
Version string `json:"version"`
|
||||
Title string `json:"title"`
|
||||
HomePageURL string `json:"home_page_url"`
|
||||
FeedURL string `json:"feed_url"`
|
||||
Description string `json:"description"`
|
||||
Language string `json:"language"`
|
||||
Items []JSONFeedItem `json:"items"`
|
||||
}
|
||||
|
||||
type JSONFeedItem struct {
|
||||
ID string `json:"id"`
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title"`
|
||||
ContentHTML string `json:"content_html"`
|
||||
ContentText string `json:"content_text"`
|
||||
Summary string `json:"summary"`
|
||||
Image string `json:"image"`
|
||||
DatePublished string `json:"date_published"`
|
||||
DateModified string `json:"date_modified"`
|
||||
Authors []JSONFeedAuthor `json:"authors"`
|
||||
Attachments []JSONFeedAttachment `json:"attachments"`
|
||||
}
|
||||
|
||||
type JSONFeedAuthor struct {
|
||||
Name string `json:"name"`
|
||||
URL string `json:"url"`
|
||||
}
|
||||
|
||||
type JSONFeedAttachment struct {
|
||||
URL string `json:"url"`
|
||||
MimeType string `json:"mime_type"`
|
||||
Size int64 `json:"size_in_bytes"`
|
||||
}
|
||||
|
||||
func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
|
||||
var jf JSONFeed
|
||||
if err := json.Unmarshal([]byte(body), &jf); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
feed.Title = jf.Title
|
||||
feed.Description = jf.Description
|
||||
feed.Language = jf.Language
|
||||
feed.SiteURL = normalizeURL(jf.HomePageURL)
|
||||
feed.ItemCount = len(jf.Items)
|
||||
|
||||
// Parse items
|
||||
now := time.Now()
|
||||
var items []*Item
|
||||
var dates []time.Time
|
||||
|
||||
for _, ji := range jf.Items {
|
||||
item := &Item{
|
||||
FeedURL: feed.URL,
|
||||
Title: ji.Title,
|
||||
Link: ji.URL,
|
||||
DiscoveredAt: now,
|
||||
}
|
||||
|
||||
// Use ID as GUID, fall back to URL
|
||||
if ji.ID != "" {
|
||||
item.GUID = ji.ID
|
||||
} else if ji.URL != "" {
|
||||
item.GUID = ji.URL
|
||||
}
|
||||
|
||||
// Content: prefer HTML, fall back to text
|
||||
if ji.ContentHTML != "" {
|
||||
item.Content = ji.ContentHTML
|
||||
} else if ji.ContentText != "" {
|
||||
item.Content = ji.ContentText
|
||||
}
|
||||
item.Description = ji.Summary
|
||||
|
||||
// Author
|
||||
if len(ji.Authors) > 0 {
|
||||
item.Author = ji.Authors[0].Name
|
||||
}
|
||||
|
||||
// Parse date
|
||||
dateStr := ji.DatePublished
|
||||
if dateStr == "" {
|
||||
dateStr = ji.DateModified
|
||||
}
|
||||
if dateStr != "" {
|
||||
if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
|
||||
item.PubDate = t
|
||||
dates = append(dates, t)
|
||||
}
|
||||
}
|
||||
|
||||
// Images
|
||||
if ji.Image != "" {
|
||||
item.ImageURLs = []string{ji.Image}
|
||||
}
|
||||
|
||||
// Attachments (enclosures)
|
||||
for _, att := range ji.Attachments {
|
||||
if att.URL != "" {
|
||||
item.Enclosure = &Enclosure{
|
||||
URL: att.URL,
|
||||
Type: att.MimeType,
|
||||
Length: att.Size,
|
||||
}
|
||||
break // Only use first attachment as enclosure
|
||||
}
|
||||
}
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
// Calculate date stats
|
||||
if len(dates) > 0 {
|
||||
oldest, newest := dates[0], dates[0]
|
||||
for _, d := range dates {
|
||||
if d.Before(oldest) {
|
||||
oldest = d
|
||||
}
|
||||
if d.After(newest) {
|
||||
newest = d
|
||||
}
|
||||
}
|
||||
feed.OldestItemDate = oldest
|
||||
feed.NewestItemDate = newest
|
||||
|
||||
if len(dates) > 1 {
|
||||
totalHours := newest.Sub(oldest).Hours()
|
||||
feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
|
||||
}
|
||||
}
|
||||
|
||||
return items
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user