Add AT Protocol publishing, media support, and SQLite stability
Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt)
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -23,17 +24,52 @@ type RSSChannel struct {
|
||||
UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
|
||||
UpdateFreq int `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
|
||||
Items []RSSItem `xml:"item"`
|
||||
// iTunes podcast namespace
|
||||
ITunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
|
||||
ITunesOwner string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
|
||||
ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
|
||||
ITunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
|
||||
}
|
||||
|
||||
type RSSItem struct {
|
||||
Title string `xml:"title"`
|
||||
Link string `xml:"link"`
|
||||
GUID string `xml:"guid"`
|
||||
Description string `xml:"description"`
|
||||
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||
Author string `xml:"author"`
|
||||
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
Title string `xml:"title"`
|
||||
Link string `xml:"link"`
|
||||
GUID string `xml:"guid"`
|
||||
Description string `xml:"description"`
|
||||
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||
Author string `xml:"author"`
|
||||
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
Enclosure *RSSEnclosure `xml:"enclosure"`
|
||||
// iTunes item elements
|
||||
ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
|
||||
ITunesEpisode int `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
|
||||
ITunesImage string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
|
||||
// Media RSS elements
|
||||
MediaContent []MediaContent `xml:"http://search.yahoo.com/mrss/ content"`
|
||||
MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
|
||||
}
|
||||
|
||||
// MediaContent maps one Media RSS <media:content> element. Every field is
// read from an attribute of that element.
type MediaContent struct {
	URL    string `xml:"url,attr"`    // value of the url attribute
	Type   string `xml:"type,attr"`   // value of the type attribute (e.g. "image/png")
	Medium string `xml:"medium,attr"` // image, video, audio
	Width  int    `xml:"width,attr"`  // value of the width attribute
	Height int    `xml:"height,attr"` // value of the height attribute
}
|
||||
|
||||
// MediaThumbnail maps one Media RSS <media:thumbnail> element. Every field is
// read from an attribute of that element.
type MediaThumbnail struct {
	URL    string `xml:"url,attr"`    // value of the url attribute
	Width  int    `xml:"width,attr"`  // value of the width attribute
	Height int    `xml:"height,attr"` // value of the height attribute
}
|
||||
|
||||
// RSSEnclosure maps the <enclosure> element of an RSS item. Every field is
// read from an attribute of that element.
type RSSEnclosure struct {
	URL    string `xml:"url,attr"`    // value of the url attribute
	Type   string `xml:"type,attr"`   // value of the type attribute (e.g. "audio/mpeg")
	Length int64  `xml:"length,attr"` // value of the length attribute
}
|
||||
|
||||
// Atom structs for parsing
|
||||
@@ -70,6 +106,43 @@ type AtomLink struct {
|
||||
Type string `xml:"type,attr"`
|
||||
}
|
||||
|
||||
// isPodcast checks if an RSS feed is a podcast based on content
|
||||
func isPodcast(ch RSSChannel) bool {
|
||||
// Check for iTunes namespace elements at channel level
|
||||
if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
|
||||
ch.ITunesExplicit != "" || ch.ITunesType != "" {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check items for audio enclosures or iTunes elements
|
||||
audioCount := 0
|
||||
for _, item := range ch.Items {
|
||||
// Check for iTunes duration or episode number
|
||||
if item.ITunesDuration != "" || item.ITunesEpisode > 0 {
|
||||
return true
|
||||
}
|
||||
// Check for audio/video enclosure
|
||||
if item.Enclosure != nil && item.Enclosure.URL != "" {
|
||||
mimeType := strings.ToLower(item.Enclosure.Type)
|
||||
if strings.HasPrefix(mimeType, "audio/") ||
|
||||
strings.HasPrefix(mimeType, "video/") ||
|
||||
strings.Contains(mimeType, "mpeg") ||
|
||||
strings.Contains(mimeType, "mp3") ||
|
||||
strings.Contains(mimeType, "mp4") ||
|
||||
strings.Contains(mimeType, "m4a") ||
|
||||
strings.Contains(mimeType, "ogg") {
|
||||
audioCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
// If more than half the items have audio enclosures, it's a podcast
|
||||
if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
var rss RSS
|
||||
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
|
||||
@@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
}
|
||||
|
||||
ch := rss.Channel
|
||||
|
||||
feed.Title = ch.Title
|
||||
feed.Description = ch.Description
|
||||
feed.Language = ch.Language
|
||||
@@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
feed.UpdateFreq = ch.UpdateFreq
|
||||
feed.ItemCount = len(ch.Items)
|
||||
|
||||
// Detect podcast
|
||||
if isPodcast(ch) {
|
||||
feed.Category = "podcast"
|
||||
}
|
||||
|
||||
// Parse lastBuildDate
|
||||
if ch.LastBuildDate != "" {
|
||||
if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
|
||||
@@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
}
|
||||
}
|
||||
|
||||
// Map enclosure
|
||||
if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
|
||||
item.Enclosure = &Enclosure{
|
||||
URL: rssItem.Enclosure.URL,
|
||||
Type: rssItem.Enclosure.Type,
|
||||
Length: rssItem.Enclosure.Length,
|
||||
}
|
||||
}
|
||||
|
||||
// Extract images from various sources
|
||||
item.ImageURLs = extractItemImages(rssItem)
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
@@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
|
||||
// Default: crawl every 6 hours
|
||||
return now.Add(6 * time.Hour)
|
||||
}
|
||||
|
||||
// extractItemImages extracts image URLs from an RSS item
|
||||
// Sources: media:content, media:thumbnail, iTunes image, and <img> tags in HTML
|
||||
func extractItemImages(rssItem RSSItem) []string {
|
||||
seen := make(map[string]bool)
|
||||
var images []string
|
||||
|
||||
addImage := func(url string) {
|
||||
url = strings.TrimSpace(url)
|
||||
if url == "" || seen[url] {
|
||||
return
|
||||
}
|
||||
// Basic validation
|
||||
if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
|
||||
return
|
||||
}
|
||||
seen[url] = true
|
||||
images = append(images, url)
|
||||
}
|
||||
|
||||
// 1. Media RSS content (prefer larger images)
|
||||
for _, mc := range rssItem.MediaContent {
|
||||
if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) {
|
||||
addImage(mc.URL)
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Media RSS thumbnails
|
||||
for _, mt := range rssItem.MediaThumbnail {
|
||||
if mt.URL != "" {
|
||||
addImage(mt.URL)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. iTunes image
|
||||
if rssItem.ITunesImage != "" {
|
||||
addImage(rssItem.ITunesImage)
|
||||
}
|
||||
|
||||
// 4. Image enclosure
|
||||
if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") {
|
||||
addImage(rssItem.Enclosure.URL)
|
||||
}
|
||||
|
||||
// 5. Extract <img> tags from description and content
|
||||
htmlImages := extractImgTags(rssItem.Description)
|
||||
htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...)
|
||||
for _, img := range htmlImages {
|
||||
addImage(img)
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
// imgSrcPattern matches the quoted src attribute of an <img> tag, capturing the
// attribute value. It is compiled once at package scope instead of on every
// call, and the (?i) flag makes the tag and attribute names case-insensitive,
// as they are in HTML.
var imgSrcPattern = regexp.MustCompile(`(?i)<img[^>]+src\s*=\s*["']([^"']+)["']`)

// extractImgTags returns the src URLs of <img> tags found in html, in the
// order they appear. It returns nil when html is empty or contains no usable
// image.
//
// Skipped values:
//   - src attributes that are empty after trimming whitespace;
//   - data: URIs;
//   - URLs containing "pixel", "spacer", "1x1" or "blank.gif", which are
//     treated as tracking or spacer images.
//
// This is a regex scan, not an HTML parser; no further URL validation is done
// here.
func extractImgTags(html string) []string {
	if html == "" {
		return nil
	}

	var urls []string
	for _, m := range imgSrcPattern.FindAllStringSubmatch(html, -1) {
		// m[1] always exists: the pattern has exactly one capture group.
		src := strings.TrimSpace(m[1])
		if src == "" || strings.HasPrefix(src, "data:") {
			continue
		}
		// Skip common tracking/spacer images.
		if strings.Contains(src, "pixel") || strings.Contains(src, "spacer") ||
			strings.Contains(src, "1x1") || strings.Contains(src, "blank.gif") {
			continue
		}
		urls = append(urls, src)
	}
	return urls
}
|
||||
|
||||
Reference in New Issue
Block a user