package main

import (
	"encoding/json"
	"encoding/xml"
	"fmt"
	"regexp"
	"strings"
	"time"
)

// RSS structs for parsing
type RSS struct {
	Channel RSSChannel `xml:"channel"`
}

type RSSChannel struct {
	Title         string    `xml:"title"`
	Link          string    `xml:"link"`
	Description   string    `xml:"description"`
	Language      string    `xml:"language"`
	LastBuildDate string    `xml:"lastBuildDate"`
	PubDate       string    `xml:"pubDate"`
	TTL           int       `xml:"ttl"`
	UpdatePeriod  string    `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
	UpdateFreq    int       `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	Items         []RSSItem `xml:"item"`

	// iTunes podcast namespace
	ITunesAuthor   string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
	ITunesOwner    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
	ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
	ITunesType     string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}

type RSSItem struct {
	Title       string        `xml:"title"`
	Link        string        `xml:"link"`
	GUID        string        `xml:"guid"`
	Description string        `xml:"description"`
	Content     string        `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
	Author      string        `xml:"author"`
	Creator     string        `xml:"http://purl.org/dc/elements/1.1/ creator"`
	PubDate     string        `xml:"pubDate"`
	Categories  []string      `xml:"category"`
	Enclosure   *RSSEnclosure `xml:"enclosure"`

	// iTunes item elements
	ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
	ITunesEpisode  int    `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
	ITunesImage    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`

	// Media RSS elements
	MediaContent   []MediaContent   `xml:"http://search.yahoo.com/mrss/ content"`
	MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}

// MediaContent represents a media:content element
type MediaContent struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`
	Medium string `xml:"medium,attr"` // image, video, audio
	Width  int    `xml:"width,attr"`
	Height int    `xml:"height,attr"`
}

// MediaThumbnail represents a media:thumbnail element
type MediaThumbnail struct {
	URL    string `xml:"url,attr"`
	Width  int    `xml:"width,attr"`
	Height int    `xml:"height,attr"`
}

type RSSEnclosure struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`
	Length int64  `xml:"length,attr"`
}

// Atom structs for parsing
type AtomFeed struct {
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"`
	Updated string      `xml:"updated"`
	Entries []AtomEntry `xml:"entry"`
}

type AtomEntry struct {
	ID         string         `xml:"id"`
	Title      string         `xml:"title"`
	Links      []AtomLink     `xml:"link"`
	Summary    string         `xml:"summary"`
	Content    AtomContent    `xml:"content"`
	Author     AtomAuthor     `xml:"author"`
	Updated    string         `xml:"updated"`
	Published  string         `xml:"published"`
	Categories []AtomCategory `xml:"category"`
}

type AtomCategory struct {
	Term  string `xml:"term,attr"`
	Label string `xml:"label,attr"`
}

type AtomContent struct {
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}

type AtomAuthor struct {
	Name string `xml:"name"`
}

type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}

// isPodcast checks if an RSS feed is a podcast based on content
func isPodcast(ch RSSChannel) bool {
	// Check for iTunes namespace elements at channel level
	if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
		ch.ITunesExplicit != "" || ch.ITunesType != "" {
		return true
	}

	// Check items for audio enclosures or iTunes elements
	audioCount := 0
	for _, item := range ch.Items {
		// Check for iTunes duration or episode number
		if item.ITunesDuration != "" || item.ITunesEpisode > 0 {
			return true
		}

		// Check for audio/video enclosure
		if item.Enclosure != nil && item.Enclosure.URL != "" {
			mimeType := strings.ToLower(item.Enclosure.Type)
			if strings.HasPrefix(mimeType, "audio/") ||
				strings.HasPrefix(mimeType, "video/") ||
				strings.Contains(mimeType, "mpeg") ||
				strings.Contains(mimeType, "mp3") ||
				strings.Contains(mimeType, "mp4") ||
				strings.Contains(mimeType, "m4a") ||
				strings.Contains(mimeType, "ogg") {
				audioCount++
			}
		}
	}
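	// For example (illustrative numbers, not from the original comments):
	// 7 audio enclosures out of 10 items satisfies the majority check below,
	// while 3 out of 10 does not.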
	// If more than half the items have audio enclosures, it's a podcast
	if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 {
		return true
	}

	return false
}

func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
	var rss RSS
	if err := xml.Unmarshal([]byte(body), &rss); err != nil {
		return nil
	}
	ch := rss.Channel

	feed.Title = ch.Title
	feed.Description = ch.Description
	feed.Language = ch.Language
	feed.SiteURL = normalizeURL(ch.Link)
	feed.ItemCount = len(ch.Items)

	// Detect podcast
	if isPodcast(ch) {
		feed.Category = "podcast"
	}

	// Parse lastBuildDate
	if ch.LastBuildDate != "" {
		if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
			feed.LastBuildDate = t
		}
	}

	// Parse items
	now := time.Now()
	var items []*Item
	var dates []time.Time

	for _, rssItem := range ch.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        rssItem.Title,
			Link:         rssItem.Link,
			Description:  rssItem.Description,
			Content:      rssItem.Content,
			DiscoveredAt: now,
		}

		// Use GUID if available, otherwise use link
		if rssItem.GUID != "" {
			item.GUID = rssItem.GUID
		} else if rssItem.Link != "" {
			item.GUID = rssItem.Link
		}

		// Author: prefer author, fall back to dc:creator
		if rssItem.Author != "" {
			item.Author = rssItem.Author
		} else if rssItem.Creator != "" {
			item.Author = rssItem.Creator
		}

		// Parse pubDate
		if rssItem.PubDate != "" {
			if t, err := parseRSSDate(rssItem.PubDate); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}

		// Map enclosure
		if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
			item.Enclosure = &Enclosure{
				URL:    rssItem.Enclosure.URL,
				Type:   rssItem.Enclosure.Type,
				Length: rssItem.Enclosure.Length,
			}
		}

		// Extract images from various sources
		item.ImageURLs = extractItemImages(rssItem)

		// Extract categories/tags
		if len(rssItem.Categories) > 0 {
			item.Tags = rssItem.Categories
		}

		items = append(items, item)
	}

	// Calculate date stats
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
	}

	return items
}
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
	var atom AtomFeed
	if err := xml.Unmarshal([]byte(body), &atom); err != nil {
		return nil
	}

	feed.Title = atom.Title
	feed.ItemCount = len(atom.Entries)

	// Get site URL from links
	for _, link := range atom.Link {
		if link.Rel == "" || link.Rel == "alternate" {
			if link.Type == "" || strings.Contains(link.Type, "html") {
				feed.SiteURL = normalizeURL(link.Href)
				break
			}
		}
	}

	// Parse updated date
	if atom.Updated != "" {
		if t, err := time.Parse(time.RFC3339, atom.Updated); err == nil {
			feed.LastBuildDate = t
		}
	}

	// Parse entries
	now := time.Now()
	var items []*Item
	var dates []time.Time

	for _, entry := range atom.Entries {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        entry.Title,
			Author:       entry.Author.Name,
			DiscoveredAt: now,
		}

		// Use ID as GUID
		if entry.ID != "" {
			item.GUID = entry.ID
		}

		// Get link (prefer alternate, fall back to first link)
		for _, link := range entry.Links {
			if link.Rel == "" || link.Rel == "alternate" {
				item.Link = link.Href
				break
			}
		}
		if item.Link == "" && len(entry.Links) > 0 {
			item.Link = entry.Links[0].Href
		}

		// Use link as GUID fallback if not set
		if item.GUID == "" && item.Link != "" {
			item.GUID = item.Link
		}

		// Summary/Content
		item.Description = entry.Summary
		item.Content = entry.Content.Value

		// Parse dates
		dateStr := entry.Updated
		if dateStr == "" {
			dateStr = entry.Published
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}

		// Extract categories/tags
		if len(entry.Categories) > 0 {
			for _, cat := range entry.Categories {
				// Prefer label, fall back to term
				tag := cat.Label
				if tag == "" {
					tag = cat.Term
				}
				if tag != "" {
					item.Tags = append(item.Tags, tag)
				}
			}
		}

		items = append(items, item)
	}

	// Calculate date stats
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
	}

	return items
}

// parseRSSDate attempts to parse various RSS date formats
func parseRSSDate(s string) (time.Time, error) {
	formats := []string{
		time.RFC1123Z,
		time.RFC1123,
		time.RFC822Z,
		time.RFC822,
		time.RFC3339,
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"2006-01-02T15:04:05-07:00",
		"2006-01-02 15:04:05",
	}
	for _, format := range formats {
		if t, err := time.Parse(format, s); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
}

// calculateNextCheck determines when to next check this feed (feed_check)
func (c *Crawler) calculateNextCheck(feed *Feed) time.Time {
	// Adaptive backoff: 100s base + 100s per consecutive no-change
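	// Illustrative schedule (worked numbers, not from the original comment):
	// NoUpdate=0 waits 100s, NoUpdate=5 waits 600s (10 min),
	// NoUpdate=35 waits 3600s (1 h).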
	return time.Now().Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
}

// extractItemImages extracts image URLs from an RSS item
// Sources: media:content, media:thumbnail, iTunes image, and <img> tags in HTML
func extractItemImages(rssItem RSSItem) []string {
	seen := make(map[string]bool)
	var images []string

	addImage := func(url string) {
		url = strings.TrimSpace(url)
		if url == "" || seen[url] {
			return
		}
		// Basic validation
		if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
			return
		}
		seen[url] = true
		images = append(images, url)
	}

	// 1. Media RSS content (prefer larger images)
	for _, mc := range rssItem.MediaContent {
		if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) {
			addImage(mc.URL)
		}
	}

	// 2. Media RSS thumbnails
	for _, mt := range rssItem.MediaThumbnail {
		if mt.URL != "" {
			addImage(mt.URL)
		}
	}

	// 3. iTunes image
	if rssItem.ITunesImage != "" {
		addImage(rssItem.ITunesImage)
	}

	// 4. Image enclosure
	if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") {
		addImage(rssItem.Enclosure.URL)
	}

	// 5. Extract <img> tags from description and content
	htmlImages := extractImgTags(rssItem.Description)
	htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...)
	for _, img := range htmlImages {
		addImage(img)
	}

	return images
}

// extractImgTags extracts src URLs from <img> tags in HTML
func extractImgTags(html string) []string {
	if html == "" {
		return nil
	}

	var urls []string

	// Simple regex to find img src attributes
	// Matches: src="..." or src='...'
	imgRegex := regexp.MustCompile(`<img[^>]+src\s*=\s*["']([^"']+)["']`)
	matches := imgRegex.FindAllStringSubmatch(html, -1)

	for _, match := range matches {
		if len(match) > 1 {
			url := strings.TrimSpace(match[1])

			// Skip data URIs, tracking pixels, and tiny images
			if strings.HasPrefix(url, "data:") {
				continue
			}

			// Skip common tracking/spacer images
			if strings.Contains(url, "pixel") ||
				strings.Contains(url, "spacer") ||
				strings.Contains(url, "1x1") ||
				strings.Contains(url, "blank.gif") {
				continue
			}

			urls = append(urls, url)
		}
	}

	return urls
}

// JSON Feed structs (https://jsonfeed.org/version/1.1)
type JSONFeed struct {
	Version     string         `json:"version"`
	Title       string         `json:"title"`
	HomePageURL string         `json:"home_page_url"`
	FeedURL     string         `json:"feed_url"`
	Description string         `json:"description"`
	Language    string         `json:"language"`
	Items       []JSONFeedItem `json:"items"`
}

type JSONFeedItem struct {
	ID            string               `json:"id"`
	URL           string               `json:"url"`
	Title         string               `json:"title"`
	ContentHTML   string               `json:"content_html"`
	ContentText   string               `json:"content_text"`
	Summary       string               `json:"summary"`
	Image         string               `json:"image"`
	DatePublished string               `json:"date_published"`
	DateModified  string               `json:"date_modified"`
	Authors       []JSONFeedAuthor     `json:"authors"`
	Tags          []string             `json:"tags"`
	Attachments   []JSONFeedAttachment `json:"attachments"`
}

type JSONFeedAuthor struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

type JSONFeedAttachment struct {
	URL      string `json:"url"`
	MimeType string `json:"mime_type"`
	Size     int64  `json:"size_in_bytes"`
}

func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
	var jf JSONFeed
	if err := json.Unmarshal([]byte(body), &jf); err != nil {
		return nil
	}

	feed.Title = jf.Title
	feed.Description = jf.Description
	feed.Language = jf.Language
	feed.SiteURL = normalizeURL(jf.HomePageURL)
	feed.ItemCount = len(jf.Items)

	// Parse items
	now := time.Now()
	var items []*Item
	var dates []time.Time

	for _, ji := range jf.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        ji.Title,
			Link:         ji.URL,
			DiscoveredAt: now,
		}

		// Use ID as GUID, fall back to URL
		if ji.ID != "" {
			item.GUID = ji.ID
		} else if ji.URL != "" {
			item.GUID = ji.URL
		}

		// Content: prefer HTML, fall back to text
		if ji.ContentHTML != "" {
			item.Content = ji.ContentHTML
		} else if ji.ContentText != "" {
			item.Content = ji.ContentText
		}
		item.Description = ji.Summary

		// Author
		if len(ji.Authors) > 0 {
			item.Author = ji.Authors[0].Name
		}

		// Parse date
		dateStr := ji.DatePublished
		if dateStr == "" {
			dateStr = ji.DateModified
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}

		// Images
		if ji.Image != "" {
			item.ImageURLs = []string{ji.Image}
		}

		// Tags
		if len(ji.Tags) > 0 {
			item.Tags = ji.Tags
		}

		// Attachments (enclosures)
		for _, att := range ji.Attachments {
			if att.URL != "" {
				item.Enclosure = &Enclosure{
					URL:    att.URL,
					Type:   att.MimeType,
					Length: att.Size,
				}
				break // Only use first attachment as enclosure
			}
		}

		items = append(items, item)
	}

	// Calculate date stats
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
	}

	return items
}
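// Illustrative usage sketch (an assumption, not part of the original file):
// a crawl pass might try each parser in turn and then use calculateNextCheck
// to schedule the next poll. Here c is a *Crawler, body the fetched response
// as a string, and feed the *Feed record being updated.
//
//	items := c.parseRSSMetadata(body, feed)
//	if items == nil {
//		items = c.parseAtomMetadata(body, feed)
//	}
//	if items == nil {
//		items = c.parseJSONFeedMetadata(body, feed)
//	}
//	next := c.calculateNextCheck(feed) // persisted so the scheduler knows when to re-fetch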