Add AT Protocol publishing, media support, and SQLite stability

Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt)
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-28 15:30:02 -05:00
parent aa6f571215
commit 75835d771d
11 changed files with 3723 additions and 635 deletions
@@ -3,6 +3,7 @@ package main
 import (
 	"encoding/xml"
 	"fmt"
+	"regexp"
 	"strings"
 	"time"
 )
@@ -23,17 +24,52 @@ type RSSChannel struct {
 	UpdatePeriod  string    `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
 	UpdateFreq    int       `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
 	Items         []RSSItem `xml:"item"`
+	// iTunes podcast namespace
+	ITunesAuthor   string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
+	ITunesOwner    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
+	ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
+	ITunesType     string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
 }

+// RSSItem maps one <item> element of an RSS channel for XML decoding. Besides
+// the core RSS fields it captures the item's enclosure, selected iTunes
+// podcast elements, and Media RSS content/thumbnail entries.
 type RSSItem struct {
-	Title       string `xml:"title"`
-	Link        string `xml:"link"`
-	GUID        string `xml:"guid"`
-	Description string `xml:"description"`
-	Content     string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
-	Author      string `xml:"author"`
-	Creator     string `xml:"http://purl.org/dc/elements/1.1/ creator"`
-	PubDate     string `xml:"pubDate"`
+	Title       string        `xml:"title"`
+	Link        string        `xml:"link"`
+	GUID        string        `xml:"guid"`
+	Description string        `xml:"description"`
+	Content     string        `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
+	Author      string        `xml:"author"`
+	Creator     string        `xml:"http://purl.org/dc/elements/1.1/ creator"`
+	PubDate     string        `xml:"pubDate"`
+	Enclosure   *RSSEnclosure `xml:"enclosure"` // nil when the item has no <enclosure>
+	// iTunes item elements
+	//
+	// NOTE(review): the `image>href` path on ITunesImage selects a nested
+	// <href> child element. Podcast feeds normally carry href as an attribute
+	// of <itunes:image>, in which case this field would stay empty. Confirm
+	// against real feeds before relying on it.
+	ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
+	ITunesEpisode  int    `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
+	ITunesImage    string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
+	// Media RSS elements (both may repeat within a single item)
+	MediaContent   []MediaContent   `xml:"http://search.yahoo.com/mrss/ content"`
+	MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
+}
+
+// MediaContent represents a media:content element. All fields are decoded
+// from attributes of the element; an attribute that is absent leaves the
+// corresponding field at its zero value.
+type MediaContent struct {
+	URL    string `xml:"url,attr"`
+	Type   string `xml:"type,attr"`
+	Medium string `xml:"medium,attr"` // image, video, audio
+	Width  int    `xml:"width,attr"`
+	Height int    `xml:"height,attr"`
+}
+
+// MediaThumbnail represents a media:thumbnail element. All fields are decoded
+// from attributes of the element; an attribute that is absent leaves the
+// corresponding field at its zero value.
+type MediaThumbnail struct {
+	URL    string `xml:"url,attr"`
+	Width  int    `xml:"width,attr"`
+	Height int    `xml:"height,attr"`
+}
+
+// RSSEnclosure holds the url, type and length attributes of an RSS item's
+// <enclosure> element.
+type RSSEnclosure struct {
+	URL    string `xml:"url,attr"`
+	Type   string `xml:"type,attr"`
+	Length int64  `xml:"length,attr"`
 }

 // Atom structs for parsing
@@ -70,6 +106,43 @@ type AtomLink struct {
 	Type string `xml:"type,attr"`
 }

+// isPodcast reports whether an RSS channel looks like a podcast feed.
+//
+// The channel is classified as a podcast when any of the following holds:
+//   - a channel-level iTunes element (author, owner name, explicit, type) is set;
+//   - any item carries an iTunes duration or a positive iTunes episode number;
+//   - more than len(ch.Items)/2 items (integer division) have an enclosure
+//     with a non-empty URL whose MIME type looks like audio or video.
+func isPodcast(ch RSSChannel) bool {
+	// A single populated channel-level iTunes field is treated as conclusive.
+	for _, field := range []string{ch.ITunesAuthor, ch.ITunesOwner, ch.ITunesExplicit, ch.ITunesType} {
+		if field != "" {
+			return true
+		}
+	}
+
+	// looksLikeMedia expects an already lower-cased MIME type. Besides the
+	// audio/ and video/ prefixes it accepts a few substrings so that loosely
+	// labelled types are still counted.
+	mediaHints := []string{"mpeg", "mp3", "mp4", "m4a", "ogg"}
+	looksLikeMedia := func(mime string) bool {
+		if strings.HasPrefix(mime, "audio/") || strings.HasPrefix(mime, "video/") {
+			return true
+		}
+		for _, hint := range mediaHints {
+			if strings.Contains(mime, hint) {
+				return true
+			}
+		}
+		return false
+	}
+
+	mediaItems := 0
+	for i := range ch.Items {
+		entry := &ch.Items[i]
+		// Item-level iTunes metadata settles the question immediately.
+		if entry.ITunesDuration != "" || entry.ITunesEpisode > 0 {
+			return true
+		}
+		enc := entry.Enclosure
+		if enc == nil || enc.URL == "" {
+			continue
+		}
+		if looksLikeMedia(strings.ToLower(enc.Type)) {
+			mediaItems++
+		}
+	}
+
+	// Podcast only if media enclosures outnumber half of the items.
+	return len(ch.Items) > 0 && mediaItems > len(ch.Items)/2
+}
+
 func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
 	var rss RSS
 	if err := xml.Unmarshal([]byte(body), &rss); err != nil {
@@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
 	}

 	ch := rss.Channel
+
 	feed.Title = ch.Title
 	feed.Description = ch.Description
 	feed.Language = ch.Language
@@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
 	feed.UpdateFreq = ch.UpdateFreq
 	feed.ItemCount = len(ch.Items)

+	// Detect podcast
+	if isPodcast(ch) {
+		feed.Category = "podcast"
+	}
+
 	// Parse lastBuildDate
 	if ch.LastBuildDate != "" {
 		if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
@@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
 			}
 		}

+		// Map enclosure
+		if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
+			item.Enclosure = &Enclosure{
+				URL:    rssItem.Enclosure.URL,
+				Type:   rssItem.Enclosure.Type,
+				Length: rssItem.Enclosure.Length,
+			}
+		}
+
+		// Extract images from various sources
+		item.ImageURLs = extractItemImages(rssItem)
+
 		items = append(items, item)
 	}

@@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
 	// Default: crawl every 6 hours
 	return now.Add(6 * time.Hour)
 }
+
+// extractItemImages gathers the distinct image URLs referenced by an RSS item.
+//
+// Candidates are visited in this order: image media:content entries,
+// media:thumbnail entries, the iTunes image, an image-typed enclosure, and
+// finally <img> tags found in the description followed by the content. Each
+// candidate is trimmed; empty values, values without an http:// or https://
+// prefix, and repeats are dropped. The result is nil when nothing qualifies.
+func extractItemImages(rssItem RSSItem) []string {
+	var (
+		result []string
+		known  = make(map[string]struct{})
+	)
+
+	// collect appends a candidate to result if it survives trimming, the
+	// scheme check and de-duplication. Empty candidates are ignored, so the
+	// callers below do not need their own emptiness checks.
+	collect := func(candidate string) {
+		candidate = strings.TrimSpace(candidate)
+		if candidate == "" {
+			return
+		}
+		if _, dup := known[candidate]; dup {
+			return
+		}
+		isHTTP := strings.HasPrefix(candidate, "http://") || strings.HasPrefix(candidate, "https://")
+		if !isHTTP {
+			return
+		}
+		known[candidate] = struct{}{}
+		result = append(result, candidate)
+	}
+
+	// Media RSS content: only entries declared as images, by medium or type.
+	for _, media := range rssItem.MediaContent {
+		if media.Medium == "image" || strings.HasPrefix(media.Type, "image/") {
+			collect(media.URL)
+		}
+	}
+
+	// Media RSS thumbnails.
+	for _, thumb := range rssItem.MediaThumbnail {
+		collect(thumb.URL)
+	}
+
+	// iTunes image.
+	collect(rssItem.ITunesImage)
+
+	// Enclosure, when its declared type is an image.
+	if enc := rssItem.Enclosure; enc != nil && strings.HasPrefix(enc.Type, "image/") {
+		collect(enc.URL)
+	}
+
+	// <img> tags embedded in the HTML of the description, then the content.
+	for _, fragment := range [...]string{rssItem.Description, rssItem.Content} {
+		for _, src := range extractImgTags(fragment) {
+			collect(src)
+		}
+	}
+
+	return result
+}
+
+// imgSrcPattern finds the src attribute of <img> tags. It is compiled once at
+// package scope instead of on every call. Matching is case-insensitive
+// because HTML tag and attribute names are. The value must be closed by the
+// same quote character that opened it, so an apostrophe inside a
+// double-quoted URL (or vice versa) no longer truncates the capture. Group 1
+// holds a double-quoted value, group 2 a single-quoted one.
+var imgSrcPattern = regexp.MustCompile(`(?i)<img[^>]+src\s*=\s*(?:"([^"]+)"|'([^']+)')`)
+
+// ampEntities decodes the HTML spellings of "&" that appear in attribute
+// values, so query strings such as "?w=1&amp;h=2" come out as usable URLs.
+var ampEntities = strings.NewReplacer(
+	"&amp;", "&",
+	"&#038;", "&",
+	"&#38;", "&",
+	"&#x26;", "&",
+)
+
+// extractImgTags extracts src URLs from <img> tags in HTML.
+//
+// It returns the URLs in document order, with surrounding whitespace removed
+// and ampersand entities decoded. data: URIs are skipped, as are URLs
+// containing "pixel", "spacer", "1x1" or "blank.gif", which are treated as
+// tracking or spacer images. The result is nil when html is empty or holds no
+// usable <img> tag. This is a regex scan, not an HTML parser: unquoted
+// attribute values are not recognised.
+func extractImgTags(html string) []string {
+	if html == "" {
+		return nil
+	}
+
+	var urls []string
+	for _, m := range imgSrcPattern.FindAllStringSubmatch(html, -1) {
+		// Exactly one of the two quote-specific groups participates in a match.
+		raw := m[1]
+		if raw == "" {
+			raw = m[2]
+		}
+		url := ampEntities.Replace(strings.TrimSpace(raw))
+
+		// Inline data URIs are not fetchable image links.
+		if strings.HasPrefix(url, "data:") {
+			continue
+		}
+		// Skip common tracking/spacer images.
+		if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
+			strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
+			continue
+		}
+		urls = append(urls, url)
+	}
+
+	return urls
+}