Files
crawler/parser.go
primal ad78c1a4c0 Add JSON Feed support
- Detect JSON Feed format (jsonfeed.org) via version field
- Parse JSON Feed metadata and items
- Support application/feed+json MIME type for feed discovery
- Include "json" as valid feed type (not auto-denied)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 13:16:50 -05:00

640 lines
16 KiB
Go

package main
import (
"encoding/json"
"encoding/xml"
"fmt"
"regexp"
"strings"
"time"
)
// RSS structs for parsing

// RSS is the root element of an RSS 2.0 document; all feed data lives
// under the single <channel> child.
type RSS struct {
	Channel RSSChannel `xml:"channel"`
}
// RSSChannel holds channel-level feed metadata plus the item list.
// Namespaced elements use Go's `xml:"<namespace-URL> <local>"` tag form:
// the syndication module (sy:) update hints and the iTunes podcast
// namespace used for podcast detection.
type RSSChannel struct {
	Title         string `xml:"title"`
	Link          string `xml:"link"`
	Description   string `xml:"description"`
	Language      string `xml:"language"`
	LastBuildDate string `xml:"lastBuildDate"`
	PubDate       string `xml:"pubDate"`
	TTL           int    `xml:"ttl"` // suggested cache lifetime in minutes
	// sy:updatePeriod ("hourly"/"daily"/...) and sy:updateFrequency,
	// consumed by calculateNextCrawl.
	UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
	UpdateFreq   int    `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	Items        []RSSItem `xml:"item"`
	// iTunes podcast namespace
	ITunesAuthor   string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
	ITunesOwner    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
	ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
	ITunesType     string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}
// RSSItem is a single <item> in an RSS channel. Optional extension
// elements (content:encoded, dc:creator, iTunes, Media RSS) are mapped
// via their namespace URLs.
type RSSItem struct {
	Title       string        `xml:"title"`
	Link        string        `xml:"link"`
	GUID        string        `xml:"guid"`
	Description string        `xml:"description"`
	Content     string        `xml:"http://purl.org/rss/1.0/modules/content/ encoded"` // content:encoded HTML body
	Author      string        `xml:"author"`
	Creator     string        `xml:"http://purl.org/dc/elements/1.1/ creator"` // dc:creator, author fallback
	PubDate     string        `xml:"pubDate"`
	Enclosure   *RSSEnclosure `xml:"enclosure"` // nil when the item has no enclosure
	// iTunes item elements
	ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
	ITunesEpisode  int    `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
	ITunesImage    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
	// Media RSS elements
	MediaContent   []MediaContent   `xml:"http://search.yahoo.com/mrss/ content"`
	MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}
// MediaContent represents a media:content element.
// Either Medium or a MIME Type may identify the content as an image
// (extractItemImages accepts either signal).
type MediaContent struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`
	Medium string `xml:"medium,attr"` // image, video, audio
	Width  int    `xml:"width,attr"`
	Height int    `xml:"height,attr"`
}
// MediaThumbnail represents a media:thumbnail element.
type MediaThumbnail struct {
	URL    string `xml:"url,attr"`
	Width  int    `xml:"width,attr"`
	Height int    `xml:"height,attr"`
}
// RSSEnclosure is an <enclosure> attachment (typically podcast audio or
// an image); Length is the declared size in bytes.
type RSSEnclosure struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"` // MIME type, e.g. "audio/mpeg"
	Length int64  `xml:"length,attr"`
}
// Atom structs for parsing

// AtomFeed is the root <feed> element of an Atom document.
type AtomFeed struct {
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"` // feed-level links; rel="alternate" points at the site
	Updated string      `xml:"updated"`
	Entries []AtomEntry `xml:"entry"`
}

// AtomEntry is a single <entry>. Updated/Published are RFC 3339 strings
// (parsed later by parseAtomMetadata).
type AtomEntry struct {
	ID        string      `xml:"id"`
	Title     string      `xml:"title"`
	Links     []AtomLink  `xml:"link"`
	Summary   string      `xml:"summary"`
	Content   AtomContent `xml:"content"`
	Author    AtomAuthor  `xml:"author"`
	Updated   string      `xml:"updated"`
	Published string      `xml:"published"`
}

// AtomContent is an entry's <content>, with its type attribute
// (e.g. "html", "text") and the raw character data.
type AtomContent struct {
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}

// AtomAuthor carries the entry author's display name.
type AtomAuthor struct {
	Name string `xml:"name"`
}

// AtomLink is a <link> element; Rel distinguishes "alternate",
// "self", etc. An empty Rel is treated as "alternate" per the spec.
type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}
// isPodcast reports whether an RSS channel looks like a podcast.
// Any iTunes namespace usage (channel- or item-level) is decisive;
// otherwise the channel qualifies when more than half of its items
// carry audio/video enclosures.
func isPodcast(ch RSSChannel) bool {
	// Channel-level iTunes elements are a strong podcast signal.
	if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
		ch.ITunesExplicit != "" || ch.ITunesType != "" {
		return true
	}
	mediaEnclosures := 0
	for _, it := range ch.Items {
		// Per-item iTunes metadata is also decisive on its own.
		if it.ITunesDuration != "" || it.ITunesEpisode > 0 {
			return true
		}
		if it.Enclosure == nil || it.Enclosure.URL == "" {
			continue
		}
		mt := strings.ToLower(it.Enclosure.Type)
		switch {
		case strings.HasPrefix(mt, "audio/"),
			strings.HasPrefix(mt, "video/"),
			strings.Contains(mt, "mpeg"),
			strings.Contains(mt, "mp3"),
			strings.Contains(mt, "mp4"),
			strings.Contains(mt, "m4a"),
			strings.Contains(mt, "ogg"):
			mediaEnclosures++
		}
	}
	// Majority of items carrying media enclosures => podcast.
	return len(ch.Items) > 0 && mediaEnclosures > len(ch.Items)/2
}
// parseRSSMetadata parses an RSS 2.0 document, fills feed-level metadata on
// feed (title, site URL, TTL, syndication hints, podcast category, item
// date statistics) and returns the channel's items mapped to *Item.
// Returns nil when the body is not valid XML.
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
	var rss RSS
	if err := xml.Unmarshal([]byte(body), &rss); err != nil {
		// Unparseable body: leave feed untouched.
		return nil
	}
	ch := rss.Channel
	feed.Title = ch.Title
	feed.Description = ch.Description
	feed.Language = ch.Language
	feed.SiteURL = normalizeURL(ch.Link)
	feed.TTLMinutes = ch.TTL
	feed.UpdatePeriod = ch.UpdatePeriod
	feed.UpdateFreq = ch.UpdateFreq
	feed.ItemCount = len(ch.Items)
	// Detect podcast
	if isPodcast(ch) {
		feed.Category = "podcast"
	}
	// Parse lastBuildDate (best effort; unparseable dates are ignored)
	if ch.LastBuildDate != "" {
		if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
			feed.LastBuildDate = t
		}
	}
	// Parse items
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, rssItem := range ch.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        rssItem.Title,
			Link:         rssItem.Link,
			Description:  rssItem.Description,
			Content:      rssItem.Content,
			DiscoveredAt: now,
		}
		// Use GUID if available, otherwise use link
		if rssItem.GUID != "" {
			item.GUID = rssItem.GUID
		} else if rssItem.Link != "" {
			item.GUID = rssItem.Link
		}
		// Author: prefer author, fall back to dc:creator
		if rssItem.Author != "" {
			item.Author = rssItem.Author
		} else if rssItem.Creator != "" {
			item.Author = rssItem.Creator
		}
		// Parse pubDate; only successfully parsed dates feed the stats below
		if rssItem.PubDate != "" {
			if t, err := parseRSSDate(rssItem.PubDate); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		// Map enclosure
		if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
			item.Enclosure = &Enclosure{
				URL:    rssItem.Enclosure.URL,
				Type:   rssItem.Enclosure.Type,
				Length: rssItem.Enclosure.Length,
			}
		}
		// Extract images from various sources
		item.ImageURLs = extractItemImages(rssItem)
		items = append(items, item)
	}
	// Calculate date stats: oldest/newest item dates and the average
	// gap (in hours) between consecutive posts, assuming items are
	// evenly spread across [oldest, newest].
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}
// parseAtomMetadata parses an Atom document, fills feed-level metadata
// (title, site URL from the alternate link, updated date, item date
// statistics) and returns the entries mapped to *Item.
// Returns nil when the body is not valid XML.
func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
	var atom AtomFeed
	if err := xml.Unmarshal([]byte(body), &atom); err != nil {
		// Unparseable body: leave feed untouched.
		return nil
	}
	feed.Title = atom.Title
	feed.ItemCount = len(atom.Entries)
	// Get site URL from links: first link with rel="alternate" (or rel
	// omitted, which defaults to alternate) that is HTML or untyped.
	for _, link := range atom.Link {
		if link.Rel == "" || link.Rel == "alternate" {
			if link.Type == "" || strings.Contains(link.Type, "html") {
				feed.SiteURL = normalizeURL(link.Href)
				break
			}
		}
	}
	// Parse updated date (Atom dates are RFC 3339 per the spec,
	// so no multi-format fallback is needed here)
	if atom.Updated != "" {
		if t, err := time.Parse(time.RFC3339, atom.Updated); err == nil {
			feed.LastBuildDate = t
		}
	}
	// Parse entries
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, entry := range atom.Entries {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        entry.Title,
			Author:       entry.Author.Name,
			DiscoveredAt: now,
		}
		// Use ID as GUID
		if entry.ID != "" {
			item.GUID = entry.ID
		}
		// Get link (prefer alternate, fall back to first link)
		for _, link := range entry.Links {
			if link.Rel == "" || link.Rel == "alternate" {
				item.Link = link.Href
				break
			}
		}
		if item.Link == "" && len(entry.Links) > 0 {
			item.Link = entry.Links[0].Href
		}
		// Use link as GUID fallback if ID was empty
		if item.GUID == "" && item.Link != "" {
			item.GUID = item.Link
		}
		// Summary/Content
		item.Description = entry.Summary
		item.Content = entry.Content.Value
		// Parse dates: prefer <updated>, fall back to <published>
		dateStr := entry.Updated
		if dateStr == "" {
			dateStr = entry.Published
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		items = append(items, item)
	}
	// Calculate date stats: oldest/newest entry dates and average hours
	// between posts (same computation as the RSS and JSON Feed parsers).
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}
// parseRSSDate attempts to parse various RSS date formats
func parseRSSDate(s string) (time.Time, error) {
formats := []string{
time.RFC1123Z,
time.RFC1123,
time.RFC822Z,
time.RFC822,
time.RFC3339,
"Mon, 2 Jan 2006 15:04:05 -0700",
"2006-01-02T15:04:05-07:00",
"2006-01-02 15:04:05",
}
for _, format := range formats {
if t, err := time.Parse(format, s); err == nil {
return t, nil
}
}
return time.Time{}, fmt.Errorf("unable to parse date: %s", s)
}
// calculateNextCrawl determines when to next crawl this feed, trying in
// priority order: the feed's advertised TTL, its sy:updatePeriod /
// sy:updateFrequency hints, the observed average posting interval
// (clamped to [1h, 24h] and halved), and finally a fixed 6-hour default.
func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
	now := time.Now()

	// 1. An explicit TTL (minutes) wins outright.
	if feed.TTLMinutes > 0 {
		return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
	}

	// 2. Syndication-module hints.
	if feed.UpdatePeriod != "" {
		freq := feed.UpdateFreq
		if freq == 0 {
			freq = 1 // frequency defaults to 1 when omitted
		}
		var unit time.Duration
		switch strings.ToLower(feed.UpdatePeriod) {
		case "hourly":
			unit = time.Hour
		case "daily":
			unit = 24 * time.Hour
		case "weekly":
			unit = 7 * 24 * time.Hour
		case "monthly":
			unit = 30 * 24 * time.Hour
		case "yearly":
			unit = 365 * 24 * time.Hour
		}
		// Unrecognized periods fall through to the heuristics below.
		if unit > 0 {
			return now.Add(time.Duration(freq) * unit)
		}
	}

	// 3. Observed cadence: crawl at half the average gap between posts,
	// but at least hourly and at most daily.
	if feed.AvgPostFreqHrs > 0 {
		hrs := feed.AvgPostFreqHrs / 2
		switch {
		case hrs < 1:
			hrs = 1
		case hrs > 24:
			hrs = 24
		}
		return now.Add(time.Duration(hrs * float64(time.Hour)))
	}

	// 4. Default: crawl every 6 hours.
	return now.Add(6 * time.Hour)
}
// extractItemImages collects candidate image URLs for an RSS item,
// de-duplicated and in priority order: media:content, media:thumbnail,
// itunes:image, image-typed enclosures, then <img> tags scraped from the
// description/content HTML. Only absolute http(s) URLs are kept.
func extractItemImages(rssItem RSSItem) []string {
	var out []string
	dedup := map[string]bool{}

	// add appends u unless it is empty, already seen, or not an
	// absolute http(s) URL.
	add := func(u string) {
		u = strings.TrimSpace(u)
		if u == "" || dedup[u] {
			return
		}
		if !strings.HasPrefix(u, "http://") && !strings.HasPrefix(u, "https://") {
			return
		}
		dedup[u] = true
		out = append(out, u)
	}

	// 1. media:content entries identified as images.
	for _, mc := range rssItem.MediaContent {
		if mc.URL == "" {
			continue
		}
		if mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/") {
			add(mc.URL)
		}
	}
	// 2. media:thumbnail entries (empty URLs are rejected inside add).
	for _, mt := range rssItem.MediaThumbnail {
		add(mt.URL)
	}
	// 3. itunes:image href.
	add(rssItem.ITunesImage)
	// 4. The enclosure, when it carries an image MIME type.
	if enc := rssItem.Enclosure; enc != nil && strings.HasPrefix(enc.Type, "image/") {
		add(enc.URL)
	}
	// 5. <img> tags embedded in the HTML description and content.
	for _, src := range extractImgTags(rssItem.Description) {
		add(src)
	}
	for _, src := range extractImgTags(rssItem.Content) {
		add(src)
	}
	return out
}
// imgSrcRegex matches the src attribute of <img> tags. HTML tag and
// attribute names are case-insensitive, hence (?i). Compiled once at
// package scope instead of on every call.
var imgSrcRegex = regexp.MustCompile(`(?i)<img[^>]+src\s*=\s*["']([^"']+)["']`)

// extractImgTags extracts src URLs from <img> tags in HTML.
// It skips data: URIs and URLs that look like tracking pixels or spacer
// images. Returns nil when the input is empty or yields no usable URLs.
func extractImgTags(html string) []string {
	if html == "" {
		return nil
	}
	var urls []string
	for _, match := range imgSrcRegex.FindAllStringSubmatch(html, -1) {
		if len(match) < 2 {
			continue
		}
		url := strings.TrimSpace(match[1])
		// Skip inline data URIs — not fetchable URLs.
		if strings.HasPrefix(url, "data:") {
			continue
		}
		// Skip common tracking/spacer images (heuristic substring match).
		if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
			strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
			continue
		}
		urls = append(urls, url)
	}
	return urls
}
// JSON Feed structs (https://jsonfeed.org/version/1.1)

// JSONFeed is the top-level JSON Feed object. Version carries the
// jsonfeed.org version URL used elsewhere for format detection.
type JSONFeed struct {
	Version     string         `json:"version"`
	Title       string         `json:"title"`
	HomePageURL string         `json:"home_page_url"`
	FeedURL     string         `json:"feed_url"`
	Description string         `json:"description"`
	Language    string         `json:"language"`
	Items       []JSONFeedItem `json:"items"`
}

// JSONFeedItem is a single entry; the spec allows content_html and/or
// content_text, and RFC 3339 date strings.
type JSONFeedItem struct {
	ID            string               `json:"id"`
	URL           string               `json:"url"`
	Title         string               `json:"title"`
	ContentHTML   string               `json:"content_html"`
	ContentText   string               `json:"content_text"`
	Summary       string               `json:"summary"`
	Image         string               `json:"image"`
	DatePublished string               `json:"date_published"`
	DateModified  string               `json:"date_modified"`
	Authors       []JSONFeedAuthor     `json:"authors"`
	Attachments   []JSONFeedAttachment `json:"attachments"`
}

// JSONFeedAuthor is an entry in an item's authors array.
type JSONFeedAuthor struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

// JSONFeedAttachment is an attached resource (e.g. podcast audio),
// analogous to an RSS enclosure.
type JSONFeedAttachment struct {
	URL      string `json:"url"`
	MimeType string `json:"mime_type"`
	Size     int64  `json:"size_in_bytes"`
}
// parseJSONFeedMetadata parses a JSON Feed document, fills feed-level
// metadata (title, description, language, site URL, item date statistics)
// and returns the items mapped to *Item.
// Returns nil when the body is not valid JSON.
func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
	var jf JSONFeed
	if err := json.Unmarshal([]byte(body), &jf); err != nil {
		// Unparseable body: leave feed untouched.
		return nil
	}
	feed.Title = jf.Title
	feed.Description = jf.Description
	feed.Language = jf.Language
	feed.SiteURL = normalizeURL(jf.HomePageURL)
	feed.ItemCount = len(jf.Items)
	// Parse items
	now := time.Now()
	var items []*Item
	var dates []time.Time
	for _, ji := range jf.Items {
		item := &Item{
			FeedURL:      feed.URL,
			Title:        ji.Title,
			Link:         ji.URL,
			DiscoveredAt: now,
		}
		// Use ID as GUID, fall back to URL
		if ji.ID != "" {
			item.GUID = ji.ID
		} else if ji.URL != "" {
			item.GUID = ji.URL
		}
		// Content: prefer HTML, fall back to text
		if ji.ContentHTML != "" {
			item.Content = ji.ContentHTML
		} else if ji.ContentText != "" {
			item.Content = ji.ContentText
		}
		item.Description = ji.Summary
		// Author: first entry of the v1.1 authors array
		if len(ji.Authors) > 0 {
			item.Author = ji.Authors[0].Name
		}
		// Parse date: prefer date_published, fall back to date_modified
		// (JSON Feed dates are RFC 3339 per the spec)
		dateStr := ji.DatePublished
		if dateStr == "" {
			dateStr = ji.DateModified
		}
		if dateStr != "" {
			if t, err := time.Parse(time.RFC3339, dateStr); err == nil {
				item.PubDate = t
				dates = append(dates, t)
			}
		}
		// Images
		if ji.Image != "" {
			item.ImageURLs = []string{ji.Image}
		}
		// Attachments (enclosures)
		for _, att := range ji.Attachments {
			if att.URL != "" {
				item.Enclosure = &Enclosure{
					URL:    att.URL,
					Type:   att.MimeType,
					Length: att.Size,
				}
				break // Only use first attachment as enclosure
			}
		}
		items = append(items, item)
	}
	// Calculate date stats: oldest/newest item dates and average hours
	// between posts (same computation as the RSS and Atom parsers).
	if len(dates) > 0 {
		oldest, newest := dates[0], dates[0]
		for _, d := range dates {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		feed.OldestItemDate = oldest
		feed.NewestItemDate = newest
		if len(dates) > 1 {
			totalHours := newest.Sub(oldest).Hours()
			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
		}
	}
	return items
}