Add AT Protocol publishing, media support, and SQLite stability

Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt)
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
primal
2026-01-28 15:30:02 -05:00
parent aa6f571215
commit 75835d771d
11 changed files with 3723 additions and 635 deletions
+184 -8
View File
@@ -3,6 +3,7 @@ package main
import (
"encoding/xml"
"fmt"
"regexp"
"strings"
"time"
)
@@ -23,17 +24,52 @@ type RSSChannel struct {
UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
UpdateFreq int `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
Items []RSSItem `xml:"item"`
// iTunes podcast namespace
ITunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
ITunesOwner string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
ITunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}
// RSSItem is the decoding target for a single <item> element of an RSS feed.
// Besides the core RSS fields it picks up content:encoded, dc:creator, the
// enclosure, iTunes podcast elements and Media RSS content/thumbnails via the
// namespace-qualified struct tags below.
//
// NOTE(review): the eight fields Title through PubDate are listed twice in
// this view; this looks like the before/after sides of a realignment-only
// diff hunk rendered without +/- markers. Confirm against the real file,
// which can only declare each field once.
type RSSItem struct {
Title string `xml:"title"`
Link string `xml:"link"`
GUID string `xml:"guid"`
Description string `xml:"description"`
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
Title string `xml:"title"`
Link string `xml:"link"`
GUID string `xml:"guid"`
Description string `xml:"description"`
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
// Enclosure is a pointer, so code in this file tests it against nil
// before reading its URL or Type.
Enclosure *RSSEnclosure `xml:"enclosure"`
// iTunes item elements
ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
ITunesEpisode int `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
ITunesImage string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
// Media RSS elements
MediaContent []MediaContent `xml:"http://search.yahoo.com/mrss/ content"`
MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}
// MediaContent represents a media:content element.
// Every field is decoded from an XML attribute of that element; RSSItem
// collects these under the http://search.yahoo.com/mrss/ namespace.
type MediaContent struct {
URL string `xml:"url,attr"`
// Type is the value of the type attribute; extractItemImages treats
// values starting with "image/" as images.
Type string `xml:"type,attr"`
Medium string `xml:"medium,attr"` // image, video, audio
Width int `xml:"width,attr"`
Height int `xml:"height,attr"`
}
// MediaThumbnail represents a media:thumbnail element.
// All three fields are decoded from XML attributes (url, width, height);
// RSSItem collects these under the http://search.yahoo.com/mrss/ namespace.
type MediaThumbnail struct {
URL string `xml:"url,attr"`
Width int `xml:"width,attr"`
Height int `xml:"height,attr"`
}
// RSSEnclosure holds the url, type and length attributes of an RSS
// <enclosure> element. RSSItem references it through a pointer field.
type RSSEnclosure struct {
URL string `xml:"url,attr"`
// Type is the value of the type attribute; callers in this file compare
// it against MIME-style prefixes such as "audio/" and "image/".
Type string `xml:"type,attr"`
// Length is the value of the length attribute.
// NOTE(review): presumably the payload size in bytes — confirm.
Length int64 `xml:"length,attr"`
}
// Atom structs for parsing
@@ -70,6 +106,43 @@ type AtomLink struct {
Type string `xml:"type,attr"`
}
// isPodcast reports whether the given RSS channel looks like a podcast feed.
//
// The channel is classified as a podcast when any of the following holds:
//   - a channel-level iTunes element (author, owner name, explicit, type) is set;
//   - any item carries an iTunes duration or a positive episode number;
//   - strictly more than half of the items have an enclosure with a URL whose
//     MIME type looks like audio or video.
func isPodcast(ch RSSChannel) bool {
	// Any populated channel-level iTunes element settles the question.
	channelTags := []string{ch.ITunesAuthor, ch.ITunesOwner, ch.ITunesExplicit, ch.ITunesType}
	for _, tag := range channelTags {
		if tag != "" {
			return true
		}
	}

	// Substrings that mark a media MIME type even without an audio/ or
	// video/ prefix.
	mediaMarkers := []string{"mpeg", "mp3", "mp4", "m4a", "ogg"}

	withMedia := 0
	for i := range ch.Items {
		entry := &ch.Items[i]

		// Item-level iTunes metadata is decisive on its own.
		if entry.ITunesDuration != "" || entry.ITunesEpisode > 0 {
			return true
		}

		enc := entry.Enclosure
		if enc == nil || enc.URL == "" {
			continue
		}

		kind := strings.ToLower(enc.Type)
		matched := strings.HasPrefix(kind, "audio/") || strings.HasPrefix(kind, "video/")
		for k := 0; !matched && k < len(mediaMarkers); k++ {
			matched = strings.Contains(kind, mediaMarkers[k])
		}
		if matched {
			withMedia++
		}
	}

	// Majority rule: strictly more than half (integer division) of the items
	// must carry a media enclosure.
	total := len(ch.Items)
	return total > 0 && withMedia > total/2
}
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
var rss RSS
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
@@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
}
ch := rss.Channel
feed.Title = ch.Title
feed.Description = ch.Description
feed.Language = ch.Language
@@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
feed.UpdateFreq = ch.UpdateFreq
feed.ItemCount = len(ch.Items)
// Detect podcast
if isPodcast(ch) {
feed.Category = "podcast"
}
// Parse lastBuildDate
if ch.LastBuildDate != "" {
if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
@@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
}
}
// Map enclosure
if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
item.Enclosure = &Enclosure{
URL: rssItem.Enclosure.URL,
Type: rssItem.Enclosure.Type,
Length: rssItem.Enclosure.Length,
}
}
// Extract images from various sources
item.ImageURLs = extractItemImages(rssItem)
items = append(items, item)
}
@@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
// Default: crawl every 6 hours
return now.Add(6 * time.Hour)
}
// extractItemImages collects the image URLs associated with an RSS item.
//
// Candidates are gathered in this order: media:content entries marked as
// images, media:thumbnail entries, the iTunes image, an image-typed enclosure,
// and finally <img> tags found in the description and in content:encoded.
// Each candidate is whitespace-trimmed; empty values, values that do not start
// with http:// or https://, and duplicates are dropped. The first occurrence
// of a URL determines its position in the result.
func extractItemImages(rssItem RSSItem) []string {
	var candidates []string

	// Media RSS content that declares itself an image via medium or MIME type.
	for _, content := range rssItem.MediaContent {
		isImage := content.Medium == "image" || strings.HasPrefix(content.Type, "image/")
		if isImage {
			candidates = append(candidates, content.URL)
		}
	}

	// Media RSS thumbnails.
	for _, thumb := range rssItem.MediaThumbnail {
		candidates = append(candidates, thumb.URL)
	}

	// iTunes image.
	candidates = append(candidates, rssItem.ITunesImage)

	// Enclosure, but only when its type says it is an image.
	if enc := rssItem.Enclosure; enc != nil && strings.HasPrefix(enc.Type, "image/") {
		candidates = append(candidates, enc.URL)
	}

	// <img> tags embedded in the HTML bodies: description first, then content.
	candidates = append(candidates, extractImgTags(rssItem.Description)...)
	candidates = append(candidates, extractImgTags(rssItem.Content)...)

	// Normalise, validate and de-duplicate while preserving order.
	var result []string
	known := make(map[string]struct{})
	for _, raw := range candidates {
		link := strings.TrimSpace(raw)
		if link == "" {
			continue
		}
		if _, dup := known[link]; dup {
			continue
		}
		isHTTP := strings.HasPrefix(link, "http://") || strings.HasPrefix(link, "https://")
		if !isHTTP {
			continue
		}
		known[link] = struct{}{}
		result = append(result, link)
	}
	return result
}
// imgSrcRegex matches an <img> tag and captures the value of its src
// attribute. It is compiled once at package scope rather than on every call.
//
//   - (?i) makes the tag and attribute names case-insensitive (<IMG SRC=...>).
//   - The \s before "src" requires an attribute boundary, so data-src and
//     similar attributes are not mistaken for src.
//   - The lazy [^>]*? picks the first src attribute inside the tag.
//   - Double- and single-quoted values get separate capture groups so that a
//     value may contain the other quote character.
var imgSrcRegex = regexp.MustCompile(`(?i)<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]+)"|'([^']+)')`)

// extractImgTags extracts src URLs from <img> tags in HTML.
//
// It returns the trimmed src values in document order, or nil when html is
// empty or contains no usable image. The following are skipped: empty values,
// data: URIs, and URLs containing "pixel", "spacer", "1x1" or "blank.gif",
// which usually denote tracking or spacer images. No scheme validation or
// de-duplication is done here.
func extractImgTags(html string) []string {
	if html == "" {
		return nil
	}

	var urls []string
	for _, match := range imgSrcRegex.FindAllStringSubmatch(html, -1) {
		// Exactly one of the two capture groups is non-empty, depending on
		// which quote style the attribute used.
		url := match[1]
		if url == "" {
			url = match[2]
		}
		url = strings.TrimSpace(url)

		// Skip blank values and inline data URIs.
		if url == "" || strings.HasPrefix(url, "data:") {
			continue
		}
		// Skip common tracking/spacer images.
		if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
			strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
			continue
		}
		urls = append(urls, url)
	}
	return urls
}