Files
crawler/html.go
primal ad78c1a4c0 Add JSON Feed support
- Detect JSON Feed format (jsonfeed.org) via version field
- Parse JSON Feed metadata and items
- Support application/feed+json MIME type for feed discovery
- Include "json" as valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 13:16:50 -05:00

140 lines
3.6 KiB
Go

package main
import (
"regexp"
"strings"
"golang.org/x/net/html"
)
// simpleFeed is a lightweight feed reference used during HTML extraction
type simpleFeed struct {
	URL  string // feed URL, made absolute against the page's base URL
	Type string // feed format: "rss", "atom", "json", or "unknown"
}
// isFeedContent reports whether a fetched response looks like a syndication
// feed, judging first by the Content-Type header and then by sniffing the
// body. It recognizes RSS/Atom XML and JSON Feed (jsonfeed.org) documents.
func (c *Crawler) isFeedContent(body, contentType string) bool {
	// Any of these MIME types (possibly with parameters such as charset)
	// is treated as a feed without inspecting the body.
	feedMIMEs := []string{
		"application/rss+xml",
		"application/atom+xml",
		"application/xml",
		"text/xml",
		"application/feed+json",
		"application/json",
	}
	for _, mime := range feedMIMEs {
		if strings.Contains(contentType, mime) {
			return true
		}
	}
	body = strings.TrimSpace(body)
	// XML feed sniffing: the <?xml ...?> prolog is optional, and many feeds
	// omit it, so also accept bodies that open directly with <rss or <feed.
	if strings.HasPrefix(body, "<?xml") ||
		strings.HasPrefix(body, "<rss") ||
		strings.HasPrefix(body, "<feed") {
		if strings.Contains(body, "<rss") || strings.Contains(body, "<feed") {
			return true
		}
	}
	// JSON Feed sniffing: the required "version" field holds a jsonfeed.org URL.
	if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
		return true
	}
	return false
}
// detectFeedType classifies a feed body as "rss", "atom", or "json",
// returning "unknown" when no format can be identified.
func (c *Crawler) detectFeedType(body string) string {
	body = strings.TrimSpace(body)
	// Decide JSON bodies first: a JSON Feed whose embedded content happens
	// to contain "<rss" or "<feed" text must not be misclassified as XML.
	if strings.HasPrefix(body, "{") {
		// JSON Feed's version field contains a jsonfeed.org URL.
		if strings.Contains(body, "jsonfeed.org") {
			return "json"
		}
		return "unknown"
	}
	if strings.Contains(body, "<rss") {
		return "rss"
	}
	if strings.Contains(body, "<feed") {
		return "atom"
	}
	return "unknown"
}
// feedLinkMIMEs maps <link type="..."> values (lowercased) to the internal
// feed type names used by simpleFeed.
var feedLinkMIMEs = map[string]string{
	"application/rss+xml":   "rss",
	"application/atom+xml":  "atom",
	"application/feed+json": "json",
}

// extractFeedLinks walks the parsed HTML tree rooted at n and collects feed
// references advertised via <link rel="alternate" type="..."> elements,
// resolving each href against baseURL.
func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "link" {
			var rel, href, typeAttr string
			for _, attr := range node.Attr {
				switch attr.Key {
				case "rel":
					// rel is case-insensitive per the HTML spec.
					rel = strings.ToLower(attr.Val)
				case "href":
					href = attr.Val
				case "type":
					// MIME types are case-insensitive; normalize for lookup.
					typeAttr = strings.ToLower(strings.TrimSpace(attr.Val))
				}
			}
			// rel is a space-separated token list (e.g. "alternate nofollow"),
			// so scan tokens rather than comparing the whole attribute value.
			isAlternate := false
			for _, token := range strings.Fields(rel) {
				if token == "alternate" {
					isAlternate = true
					break
				}
			}
			if isAlternate {
				if feedType, ok := feedLinkMIMEs[typeAttr]; ok {
					feeds = append(feeds, simpleFeed{
						URL:  makeAbsoluteURL(href, baseURL),
						Type: feedType,
					})
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(n)
	return feeds
}
// anchorFeedPattern matches hrefs that look like feed URLs:
//   - /feed, /rss, /atom as path segments (not "feeds" or "feedback")
//   - .rss, .atom, .xml file extensions
//   - ?feed=, ?format=rss, etc.
//
// Compiled once at package scope so it is not rebuilt on every call.
var anchorFeedPattern = regexp.MustCompile(`(?i)(/feed/?$|/feed/|/rss/?$|/rss/|/atom/?$|/atom/|\.rss|\.atom|\.xml|\?.*feed=|\?.*format=rss|\?.*format=atom)`)

// extractAnchorFeeds walks the parsed HTML tree rooted at n and collects
// <a href> targets whose URLs look like feeds, resolving each against
// baseURL. The feed format is unknown until the URL is actually fetched.
func (c *Crawler) extractAnchorFeeds(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "a" {
			for _, attr := range node.Attr {
				if attr.Key == "href" && anchorFeedPattern.MatchString(attr.Val) {
					feeds = append(feeds, simpleFeed{
						URL:  makeAbsoluteURL(attr.Val, baseURL),
						Type: "unknown",
					})
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(n)
	return feeds
}
// extractLinks collects the absolute URL of every <a href> found in the
// parsed HTML tree rooted at n, resolving relative hrefs against baseURL.
func (c *Crawler) extractLinks(n *html.Node, baseURL string) []string {
	links := make([]string, 0)
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "a" {
			for _, attr := range node.Attr {
				if attr.Key != "href" {
					continue
				}
				links = append(links, makeAbsoluteURL(attr.Val, baseURL))
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(n)
	return links
}