- Detect JSON Feed format (jsonfeed.org) via the version field
- Parse JSON Feed metadata and items
- Support the application/feed+json MIME type for feed discovery
- Include "json" as a valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
140 lines
3.6 KiB
Go
140 lines
3.6 KiB
Go
package main
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// simpleFeed is a minimal (URL, kind) pair describing a feed candidate that
// was discovered while walking an HTML document.
type simpleFeed struct {
	// URL is the feed address as produced by makeAbsoluteURL during extraction.
	URL string

	// Type is the feed flavour. The extractors in this file set it to "rss",
	// "atom", "json", or "unknown" when the flavour cannot be told from the
	// markup alone.
	Type string
}
|
|
|
|
func (c *Crawler) isFeedContent(body, contentType string) bool {
|
|
if strings.Contains(contentType, "application/rss+xml") ||
|
|
strings.Contains(contentType, "application/atom+xml") ||
|
|
strings.Contains(contentType, "application/xml") ||
|
|
strings.Contains(contentType, "text/xml") ||
|
|
strings.Contains(contentType, "application/feed+json") ||
|
|
strings.Contains(contentType, "application/json") {
|
|
return true
|
|
}
|
|
|
|
body = strings.TrimSpace(body)
|
|
if strings.HasPrefix(body, "<?xml") {
|
|
if strings.Contains(body, "<rss") || strings.Contains(body, "<feed") {
|
|
return true
|
|
}
|
|
}
|
|
// Check for JSON Feed
|
|
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (c *Crawler) detectFeedType(body string) string {
|
|
if strings.Contains(body, "<rss") {
|
|
return "rss"
|
|
}
|
|
if strings.Contains(body, "<feed") {
|
|
return "atom"
|
|
}
|
|
// Check for JSON Feed (version field contains jsonfeed.org URL)
|
|
body = strings.TrimSpace(body)
|
|
if strings.HasPrefix(body, "{") && strings.Contains(body, "jsonfeed.org") {
|
|
return "json"
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
|
|
feeds := make([]simpleFeed, 0)
|
|
var f func(*html.Node)
|
|
|
|
f = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.Data == "link" {
|
|
var rel, href, typeAttr string
|
|
for _, attr := range n.Attr {
|
|
switch attr.Key {
|
|
case "rel":
|
|
rel = attr.Val
|
|
case "href":
|
|
href = attr.Val
|
|
case "type":
|
|
typeAttr = attr.Val
|
|
}
|
|
}
|
|
|
|
if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml" || typeAttr == "application/feed+json") {
|
|
absURL := makeAbsoluteURL(href, baseURL)
|
|
feedType := "rss"
|
|
if typeAttr == "application/atom+xml" {
|
|
feedType = "atom"
|
|
} else if typeAttr == "application/feed+json" {
|
|
feedType = "json"
|
|
}
|
|
feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
|
|
}
|
|
}
|
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
|
f(child)
|
|
}
|
|
}
|
|
f(n)
|
|
return feeds
|
|
}
|
|
|
|
func (c *Crawler) extractAnchorFeeds(n *html.Node, baseURL string) []simpleFeed {
|
|
feeds := make([]simpleFeed, 0)
|
|
// Match feed URLs more precisely:
|
|
// - /feed, /rss, /atom as path segments (not "feeds" or "feedback")
|
|
// - .rss, .atom, .xml file extensions
|
|
// - ?feed=, ?format=rss, etc.
|
|
feedPattern := regexp.MustCompile(`(?i)(/feed/?$|/feed/|/rss/?$|/rss/|/atom/?$|/atom/|\.rss|\.atom|\.xml|\?.*feed=|\?.*format=rss|\?.*format=atom)`)
|
|
|
|
var f func(*html.Node)
|
|
f = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.Data == "a" {
|
|
for _, attr := range n.Attr {
|
|
if attr.Key == "href" {
|
|
href := attr.Val
|
|
if feedPattern.MatchString(href) {
|
|
absURL := makeAbsoluteURL(href, baseURL)
|
|
feeds = append(feeds, simpleFeed{URL: absURL, Type: "unknown"})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
|
f(child)
|
|
}
|
|
}
|
|
f(n)
|
|
return feeds
|
|
}
|
|
|
|
func (c *Crawler) extractLinks(n *html.Node, baseURL string) []string {
|
|
links := make([]string, 0)
|
|
var f func(*html.Node)
|
|
|
|
f = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.Data == "a" {
|
|
for _, attr := range n.Attr {
|
|
if attr.Key == "href" {
|
|
link := makeAbsoluteURL(attr.Val, baseURL)
|
|
links = append(links, link)
|
|
}
|
|
}
|
|
}
|
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
|
f(child)
|
|
}
|
|
}
|
|
f(n)
|
|
return links
|
|
}
|