diff --git a/feed.go b/feed.go index d4c2e46..06ee93a 100644 --- a/feed.go +++ b/feed.go @@ -178,12 +178,12 @@ type Feed struct { // saveFeed stores a feed in PostgreSQL func (c *Crawler) saveFeed(feed *Feed) error { // Default publishStatus to "held" if not set - // Auto-deny feeds with no language or non-RSS/Atom type + // Auto-deny feeds with no language or unsupported type publishStatus := feed.PublishStatus if publishStatus == "" { if feed.Language == "" { publishStatus = "deny" - } else if feed.Type != "rss" && feed.Type != "atom" { + } else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" { publishStatus = "deny" } else { publishStatus = "held" @@ -779,6 +779,8 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea items = c.parseRSSMetadata(body, feed) case "atom": items = c.parseAtomMetadata(body, feed) + case "json": + items = c.parseJSONFeedMetadata(body, feed) } // Refine category based on parsed title (e.g., "Comments on:") @@ -951,6 +953,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) { items = c.parseRSSMetadata(body, feed) case "atom": items = c.parseAtomMetadata(body, feed) + case "json": + items = c.parseJSONFeedMetadata(body, feed) } // Content changed - reset backoff diff --git a/html.go b/html.go index 2fbb0d2..8ca498d 100644 --- a/html.go +++ b/html.go @@ -17,7 +17,9 @@ func (c *Crawler) isFeedContent(body, contentType string) bool { if strings.Contains(contentType, "application/rss+xml") || strings.Contains(contentType, "application/atom+xml") || strings.Contains(contentType, "application/xml") || - strings.Contains(contentType, "text/xml") { + strings.Contains(contentType, "text/xml") || + strings.Contains(contentType, "application/feed+json") || + strings.Contains(contentType, "application/json") { return true } @@ -27,6 +29,10 @@ func (c *Crawler) isFeedContent(body, contentType string) bool { return true } } + // Check for JSON Feed + if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") { + return true + } return false } @@ -37,6 +43,11 @@ func (c *Crawler) detectFeedType(body string) string { if strings.Contains(body, " 0 { + item.Author = ji.Authors[0].Name + } + + // Parse date + dateStr := ji.DatePublished + if dateStr == "" { + dateStr = ji.DateModified + } + if dateStr != "" { + if t, err := time.Parse(time.RFC3339, dateStr); err == nil { + item.PubDate = t + dates = append(dates, t) + } + } + + // Images + if ji.Image != "" { + item.ImageURLs = []string{ji.Image} + } + + // Attachments (enclosures) + for _, att := range ji.Attachments { + if att.URL != "" { + item.Enclosure = &Enclosure{ + URL: att.URL, + Type: att.MimeType, + Length: att.Size, + } + break // Only use first attachment as enclosure + } + } + + items = append(items, item) + } + + // Calculate date stats + if len(dates) > 0 { + oldest, newest := dates[0], dates[0] + for _, d := range dates { + if d.Before(oldest) { + oldest = d + } + if d.After(newest) { + newest = d + } + } + feed.OldestItemDate = oldest + feed.NewestItemDate = newest + + if len(dates) > 1 { + totalHours := newest.Sub(oldest).Hours() + feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1) + } + } + + return items +}