Add AT Protocol publishing, media support, and SQLite stability

Publishing:
- Add publisher.go for posting feed items to an AT Protocol PDS
- Support deterministic rkeys derived from SHA256(guid + discoveredAt); see the sketch after this list
- Handle multiple URLs in posts, with a link facet for each URL
- Image embed support (app.bsky.embed.images) for up to 4 images
- Fall back to an external link embed (with thumbnail when available) if no images are uploaded
- Podcast/audio enclosure URLs included in the post text
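
For illustration, a minimal standalone sketch of the rkey derivation, mirroring GenerateRkey in publisher.go (the GUID and timestamp below are made up):

    package main

    import (
        "crypto/sha256"
        "encoding/base32"
        "fmt"
        "strings"
        "time"
    )

    // rkeyFor mirrors publisher.go's GenerateRkey: hash guid + "|" + discoveredAt,
    // keep the first 10 bytes, and base32-encode them (lowercase, unpadded), so
    // the same item always maps to the same record key.
    func rkeyFor(guid string, discoveredAt time.Time) string {
        input := guid + "|" + discoveredAt.UTC().Format(time.RFC3339)
        sum := sha256.Sum256([]byte(input))
        enc := base32.StdEncoding.WithPadding(base32.NoPadding)
        return strings.ToLower(enc.EncodeToString(sum[:10]))
    }

    func main() {
        // Made-up item: identical inputs always give the same rkey; as publisher.go
        // notes, bumping discoveredAt is the way to force a fresh rkey.
        discovered := time.Date(2026, 1, 28, 15, 30, 0, 0, time.UTC)
        fmt.Println(rkeyFor("https://example.com/articles/1", discovered))
    }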

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails (see the parsing sketch after this list)
- Extract images from HTML content in descriptions
- Store enclosure metadata and imageUrls (JSON array) in the items table
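
A rough illustration of that mapping, using minimal stand-in structs (the real RSSItem in rss.go carries more fields; the XML is a made-up example):

    package main

    import (
        "encoding/xml"
        "fmt"
    )

    // Minimal stand-ins for the structs added to rss.go.
    type enclosure struct {
        URL    string `xml:"url,attr"`
        Type   string `xml:"type,attr"`
        Length int64  `xml:"length,attr"`
    }

    type mediaThumbnail struct {
        URL string `xml:"url,attr"`
    }

    type rssItem struct {
        Title      string           `xml:"title"`
        Enclosure  *enclosure       `xml:"enclosure"`
        Thumbnails []mediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
    }

    func main() {
        raw := `<item xmlns:media="http://search.yahoo.com/mrss/">
          <title>Episode 1</title>
          <enclosure url="https://example.com/ep1.mp3" type="audio/mpeg" length="12345"/>
          <media:thumbnail url="https://example.com/ep1.jpg"/>
        </item>`

        var it rssItem
        if err := xml.Unmarshal([]byte(raw), &it); err != nil {
            panic(err)
        }
        // The crawler stores these as enclosureUrl/enclosureType/enclosureLength and
        // folds the thumbnail URL into the item's imageUrls JSON array.
        fmt.Println(it.Enclosure.URL, it.Enclosure.Type, it.Thumbnails[0].URL)
    }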

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas (connection string sketched after this list)
- Connection pool tuning (max idle conns, max lifetime)
- Passive WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown
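
A minimal sketch of the connection setup, assuming the modernc.org/sqlite driver and an example database path:

    package main

    import (
        "database/sql"
        "fmt"
        "time"

        _ "modernc.org/sqlite"
    )

    func main() {
        // Same _pragma style OpenDatabase uses; modernc.org/sqlite applies these to
        // every new connection. "feeds/feeds.db" is just an example path.
        dsn := "feeds/feeds.db" +
            "?_pragma=busy_timeout(10000)" +
            "&_pragma=journal_mode(WAL)" +
            "&_pragma=synchronous(NORMAL)" +
            "&_pragma=wal_autocheckpoint(1000)" +
            "&_pragma=foreign_keys(ON)"

        db, err := sql.Open("sqlite", dsn)
        if err != nil {
            panic(err)
        }
        defer db.Close()

        // Pool tuning from this commit: few connections, recycled periodically.
        db.SetMaxOpenConns(4)
        db.SetMaxIdleConns(2)
        db.SetConnMaxLifetime(5 * time.Minute)
        db.SetConnMaxIdleTime(1 * time.Minute)

        var mode string
        if err := db.QueryRow("PRAGMA journal_mode").Scan(&mode); err != nil {
            panic(err)
        }
        fmt.Println("journal_mode:", mode) // expect "wal"
    }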

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes (see the sketch after this list)
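
A hypothetical wiring of the new publisher.go helpers for that flow, as a file alongside publisher.go in the same package (host, email, and passwords are placeholders; the dashboard glue itself is not shown here):

    package main

    import "fmt"

    // createFeedAccount is a hypothetical helper showing how the dashboard flow can
    // compose the new publisher.go functions: mint a single-use invite with the PDS
    // admin password, then register the feed's derived handle.
    func createFeedAccount(pdsHost, adminPassword, feedURL, email, accountPassword string) (*PDSSession, error) {
        pub := NewPublisher(pdsHost) // e.g. "https://pds.example.com" (placeholder)

        code, err := pub.CreateInviteCode(adminPassword, 1) // admin Basic Auth, single use
        if err != nil {
            return nil, fmt.Errorf("invite code: %w", err)
        }

        handle := DeriveHandleFromFeed(feedURL) // e.g. "show-ycombinator.1440.news"
        session, err := pub.CreateAccount(handle, email, accountPassword, code)
        if err != nil {
            return nil, fmt.Errorf("create account: %w", err)
        }
        return session, nil
    }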

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
primal
2026-01-28 15:30:02 -05:00
parent aa6f571215
commit 75835d771d
11 changed files with 3723 additions and 635 deletions
+60 -3
@@ -61,6 +61,13 @@ func NewCrawler(dbPath string) (*Crawler, error) {
func (c *Crawler) Close() error {
if c.db != nil {
// Checkpoint WAL to merge it back into main database before closing
// This prevents corruption if the container is stopped mid-write
fmt.Println("Checkpointing WAL...")
if _, err := c.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)"); err != nil {
fmt.Printf("WAL checkpoint warning: %v\n", err)
}
fmt.Println("Closing database...")
return c.db.Close()
}
return nil
@@ -87,6 +94,56 @@ func (c *Crawler) StartCleanupLoop() {
}
}
// StartMaintenanceLoop performs periodic database maintenance
// - WAL checkpoint every 5 minutes to prevent WAL bloat and reduce corruption risk
// - Quick integrity check every hour to detect issues early
// - Hot backup every 24 hours for recovery
func (c *Crawler) StartMaintenanceLoop() {
checkpointTicker := time.NewTicker(5 * time.Minute)
integrityTicker := time.NewTicker(1 * time.Hour)
backupTicker := time.NewTicker(24 * time.Hour)
defer checkpointTicker.Stop()
defer integrityTicker.Stop()
defer backupTicker.Stop()
for {
select {
case <-checkpointTicker.C:
// Passive checkpoint - doesn't block writers
if _, err := c.db.Exec("PRAGMA wal_checkpoint(PASSIVE)"); err != nil {
fmt.Printf("WAL checkpoint error: %v\n", err)
}
case <-integrityTicker.C:
// Quick check is faster than full integrity_check
var result string
if err := c.db.QueryRow("PRAGMA quick_check").Scan(&result); err != nil {
fmt.Printf("Integrity check error: %v\n", err)
} else if result != "ok" {
fmt.Printf("WARNING: Database integrity issue detected: %s\n", result)
}
case <-backupTicker.C:
c.createBackup()
}
}
}
// createBackup creates a hot backup of the database using SQLite's backup API
func (c *Crawler) createBackup() {
backupPath := "feeds/feeds.db.backup"
fmt.Println("Creating database backup...")
// Use SQLite's online backup via VACUUM INTO (available in SQLite 3.27+)
// This creates a consistent snapshot without blocking writers
if _, err := c.db.Exec("VACUUM INTO ?", backupPath); err != nil {
fmt.Printf("Backup error: %v\n", err)
return
}
fmt.Printf("Backup created: %s\n", backupPath)
}
// StartCrawlLoop runs the domain crawling loop independently
func (c *Crawler) StartCrawlLoop() {
numWorkers := runtime.NumCPU()
@@ -113,9 +170,9 @@ func (c *Crawler) StartCrawlLoop() {
}()
}
const fetchSize = 100
const fetchSize = 1000
for {
domains, err := c.GetUncheckedDomainsRandom(fetchSize)
domains, err := c.GetUncheckedDomains(fetchSize)
if err != nil {
fmt.Printf("Error fetching domains: %v\n", err)
}
@@ -155,7 +212,7 @@ func (c *Crawler) StartCheckLoop() {
}()
}
const fetchSize = 100
const fetchSize = 1000
for {
feeds, err := c.GetFeedsDueForCheck(fetchSize)
if err != nil {
+1569 -66
File diff suppressed because it is too large
+43 -5
@@ -3,6 +3,7 @@ package main
import (
"database/sql"
"fmt"
"time"
_ "modernc.org/sqlite"
)
@@ -25,6 +26,7 @@ CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WH
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
type TEXT,
category TEXT DEFAULT 'main',
title TEXT,
description TEXT,
language TEXT,
@@ -56,14 +58,20 @@ CREATE TABLE IF NOT EXISTS feeds (
oldestItemDate DATETIME,
newestItemDate DATETIME,
noUpdate INTEGER DEFAULT 0
noUpdate INTEGER DEFAULT 0,
-- Publishing to PDS
publishStatus TEXT DEFAULT 'held' CHECK(publishStatus IN ('held', 'pass', 'fail')),
publishAccount TEXT
);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_publishStatus ON feeds(publishStatus);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
@@ -80,6 +88,17 @@ CREATE TABLE IF NOT EXISTS items (
pubDate DATETIME,
discoveredAt DATETIME NOT NULL,
updatedAt DATETIME,
-- Media attachments
enclosureUrl TEXT,
enclosureType TEXT,
enclosureLength INTEGER,
imageUrls TEXT, -- JSON array of image URLs
-- Publishing to PDS
publishedAt DATETIME,
publishedUri TEXT,
UNIQUE(feedUrl, guid)
);
@@ -87,6 +106,7 @@ CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feedUrl, publishedAt) WHERE publishedAt IS NULL;
-- Full-text search for feeds
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
@@ -148,15 +168,22 @@ func OpenDatabase(dbPath string) (*sql.DB, error) {
fmt.Printf("Opening database: %s\n", dbPath)
// Use pragmas in connection string for consistent application
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
// - busy_timeout: wait up to 10s for locks instead of failing immediately
// - journal_mode: WAL for better concurrency and crash recovery
// - synchronous: NORMAL is safe with WAL (fsync at checkpoint, not every commit)
// - wal_autocheckpoint: checkpoint every 1000 pages (~4MB) to prevent WAL bloat
// - foreign_keys: enforce referential integrity
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=wal_autocheckpoint(1000)&_pragma=foreign_keys(ON)"
db, err := sql.Open("sqlite", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %v", err)
}
// Allow multiple readers (WAL mode supports concurrent reads)
// SQLite is single-writer, but reads can happen concurrently
db.SetMaxOpenConns(4)
// Connection pool settings for stability
db.SetMaxOpenConns(4) // Limit concurrent connections
db.SetMaxIdleConns(2) // Keep some connections warm
db.SetConnMaxLifetime(5 * time.Minute) // Recycle connections periodically
db.SetConnMaxIdleTime(1 * time.Minute) // Close idle connections
// Verify connection and show journal mode
var journalMode string
@@ -173,6 +200,17 @@ func OpenDatabase(dbPath string) (*sql.DB, error) {
}
fmt.Println(" Schema OK")
// Migrations for existing databases
migrations := []string{
"ALTER TABLE items ADD COLUMN enclosureUrl TEXT",
"ALTER TABLE items ADD COLUMN enclosureType TEXT",
"ALTER TABLE items ADD COLUMN enclosureLength INTEGER",
"ALTER TABLE items ADD COLUMN imageUrls TEXT",
}
for _, m := range migrations {
db.Exec(m) // Ignore errors (column may already exist)
}
// Run stats and ANALYZE in background to avoid blocking startup with large databases
go func() {
var domainCount, feedCount int
+1
@@ -3,6 +3,7 @@ services:
build: .
container_name: app-1440-news
restart: unless-stopped
stop_grace_period: 30s
env_file:
- pds.env
volumes:
+4 -18
@@ -88,26 +88,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
return domain, nil
}
// GetUncheckedDomains returns all domains with status "unchecked"
func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) {
// GetUncheckedDomains returns up to limit unchecked domains ordered by discoveredAt (FIFO)
func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
`)
if err != nil {
return nil, err
}
defer rows.Close()
return c.scanDomains(rows)
}
// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order
func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
ORDER BY RANDOM()
ORDER BY discoveredAt ASC
LIMIT ?
`, limit)
if err != nil {
@@ -224,7 +210,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
buf := make([]byte, 0, 64*1024)
scanner.Buffer(buf, 1024*1024)
const batchSize = 10000
const batchSize = 1000
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
totalImported := 0
+335 -100
@@ -2,6 +2,7 @@ package main
import (
"database/sql"
"encoding/json"
"fmt"
"io"
"net/http"
@@ -12,58 +13,91 @@ import (
"time"
)
// shouldSkipFeed checks if a feed URL should be filtered out
// Returns true (and a reason) if the feed should be skipped
func shouldSkipFeed(feedURL string) (bool, string) {
// classifyFeed determines the category of a feed based on URL patterns
// Returns: "main", "comments", "category", "author", "article", "podcast"
// Note: podcast detection is also done in parseRSSMetadata based on content
func classifyFeed(feedURL string) string {
lower := strings.ToLower(feedURL)
// Skip explicit comment feeds
// Comment feeds
if strings.Contains(lower, "/comment") {
return true, "comment feed"
return "comments"
}
// Podcast URL patterns
podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
for _, pattern := range podcastPatterns {
if strings.Contains(lower, pattern) {
return "podcast"
}
}
u, err := url.Parse(feedURL)
if err != nil {
return false, ""
return "main"
}
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
// Skip category/tag feeds
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
// Author feeds
if strings.Contains(path, "/author/") {
return "author"
}
// Category/tag feeds
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
for _, pattern := range categoryPatterns {
if strings.Contains(path, pattern) {
return true, "category/tag feed"
return "category"
}
}
// Check for article comment feeds (path ending in /feed with content before it)
// Check for article feeds (path ending in /feed with content before it)
if strings.HasSuffix(path, "/feed") {
basePath := strings.TrimSuffix(path, "/feed")
basePath = strings.Trim(basePath, "/")
if basePath == "" {
return false, "" // Just /feed - legitimate main feed
return "main" // Just /feed - main feed
}
// Skip if path contains date patterns (likely article)
// Article if path contains date patterns
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
return true, "article feed (date pattern)"
return "article"
}
// Skip if path has multiple segments (likely article or nested content)
// Article if path has multiple segments (nested content)
segments := strings.Split(basePath, "/")
if len(segments) >= 2 {
return true, "article feed (nested path)"
return "article"
}
// Skip if single segment looks like an article slug (contains hyphens, is long)
if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
return true, "article feed (slug pattern)"
// Article if single segment looks like an article slug
if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 {
return "article"
}
}
return false, ""
return "main"
}
// classifyFeedByTitle refines category based on feed title (called after parsing)
func classifyFeedByTitle(title string, currentCategory string) string {
if currentCategory != "main" {
return currentCategory // Already classified by URL
}
lower := strings.ToLower(title)
if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") {
return "comments"
}
return currentCategory
}
// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
URL string `json:"url"`
Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
Length int64 `json:"length"` // Size in bytes
}
// Item represents an individual entry/article from a feed
@@ -79,12 +113,21 @@ type Item struct {
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
// Media attachments
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
// Publishing to PDS
PublishedAt time.Time `json:"published_at,omitempty"`
PublishedUri string `json:"published_uri,omitempty"`
}
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
URL string `json:"url"`
Type string `json:"type"` // "rss", "atom", or "unknown"
Type string `json:"type"` // "rss", "atom", or "unknown"
Category string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
@@ -124,23 +167,35 @@ type Feed struct {
// Adaptive check interval
NoUpdate int `json:"no_update"` // Consecutive checks with no change
// Publishing to PDS
PublishStatus string `json:"publish_status"` // "held", "pass", "fail"
PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}
// saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error {
// Default publishStatus to "held" if not set
publishStatus := feed.PublishStatus
if publishStatus == "" {
publishStatus = "held"
}
_, err := c.db.Exec(`
INSERT INTO feeds (
url, type, title, description, language, siteUrl,
url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
noUpdate,
publishStatus, publishAccount
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
type = excluded.type,
category = excluded.category,
title = excluded.title,
description = excluded.description,
language = excluded.language,
@@ -161,9 +216,11 @@ func (c *Crawler) saveFeed(feed *Feed) error {
avgPostFreqHrs = excluded.avgPostFreqHrs,
oldestItemDate = excluded.oldestItemDate,
newestItemDate = excluded.newestItemDate,
noUpdate = excluded.noUpdate
noUpdate = excluded.noUpdate,
publishStatus = excluded.publishStatus,
publishAccount = excluded.publishAccount
`,
feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
feed.URL, feed.Type, feed.Category, nullString(feed.Title), nullString(feed.Description),
nullString(feed.Language), nullString(feed.SiteURL),
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
nullString(feed.ETag), nullString(feed.LastModified),
@@ -172,6 +229,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
feed.NoUpdate,
publishStatus, nullString(feed.PublishAccount),
)
return err
}
@@ -179,23 +237,25 @@ func (c *Crawler) saveFeed(feed *Feed) error {
// getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var category, title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
var publishStatus, publishAccount sql.NullString
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
noUpdate,
publishStatus, publishAccount
FROM feeds WHERE url = ?
`, normalizeURL(feedURL)).Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
@@ -203,6 +263,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
&publishStatus, &publishAccount,
)
if err == sql.ErrNoRows {
@@ -213,6 +274,11 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
}
// Handle nullable fields
if category.Valid {
feed.Category = category.String
} else {
feed.Category = "main" // Default
}
if title.Valid {
feed.Title = title.String
}
@@ -267,6 +333,14 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
if publishStatus.Valid {
feed.PublishStatus = publishStatus.String
} else {
feed.PublishStatus = "held"
}
if publishAccount.Valid {
feed.PublishAccount = publishAccount.String
}
return feed, nil
}
@@ -281,14 +355,15 @@ func (c *Crawler) feedExists(feedURL string) bool {
// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
noUpdate,
publishStatus, publishAccount
FROM feeds
`)
if err != nil {
@@ -316,14 +391,15 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
noUpdate,
publishStatus, publishAccount
FROM feeds
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
ORDER BY RANDOM()
@@ -340,14 +416,15 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
noUpdate,
publishStatus, publishAccount
FROM feeds WHERE sourceHost = ?
`, host)
if err != nil {
@@ -361,14 +438,15 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
// SearchFeeds performs a full-text search on feeds
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl,
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
f.etag, f.lastModified,
f.ttlMinutes, f.updatePeriod, f.updateFreq,
f.status, f.errorCount, f.lastError, f.lastErrorAt,
f.sourceUrl, f.sourceHost, f.tld,
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
f.noUpdate
f.noUpdate,
f.publishStatus, f.publishAccount
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
@@ -387,13 +465,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
for rows.Next() {
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var category, title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
var publishStatus, publishAccount sql.NullString
if err := rows.Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
@@ -401,11 +480,17 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
&publishStatus, &publishAccount,
); err != nil {
continue
}
// Handle nullable fields
if category.Valid {
feed.Category = category.String
} else {
feed.Category = "main"
}
if title.Valid {
feed.Title = title.String
}
@@ -460,6 +545,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
if publishStatus.Valid {
feed.PublishStatus = publishStatus.String
} else {
feed.PublishStatus = "held"
}
if publishAccount.Valid {
feed.PublishAccount = publishAccount.String
}
feeds = append(feeds, feed)
}
@@ -469,9 +562,27 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
// Serialize enclosure fields
var enclosureUrl, enclosureType sql.NullString
var enclosureLength sql.NullInt64
if item.Enclosure != nil {
enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
}
// Serialize imageUrls as JSON
var imageUrlsJSON sql.NullString
if len(item.ImageURLs) > 0 {
if data, err := json.Marshal(item.ImageURLs); err == nil {
imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
}
}
_, err := c.db.Exec(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
enclosureUrl, enclosureType, enclosureLength, imageUrls)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
@@ -479,11 +590,16 @@ func (c *Crawler) saveItem(item *Item) error {
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
updatedAt = excluded.updatedAt,
enclosureUrl = excluded.enclosureUrl,
enclosureType = excluded.enclosureType,
enclosureLength = excluded.enclosureLength,
imageUrls = excluded.imageUrls
`,
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
)
return err
}
@@ -501,8 +617,9 @@ func (c *Crawler) saveItems(items []*Item) error {
defer tx.Rollback()
stmt, err := tx.Prepare(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
enclosureUrl, enclosureType, enclosureLength, imageUrls)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
@@ -510,7 +627,11 @@ func (c *Crawler) saveItems(items []*Item) error {
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
updatedAt = excluded.updatedAt,
enclosureUrl = excluded.enclosureUrl,
enclosureType = excluded.enclosureType,
enclosureLength = excluded.enclosureLength,
imageUrls = excluded.imageUrls
`)
if err != nil {
return err
@@ -521,10 +642,29 @@ func (c *Crawler) saveItems(items []*Item) error {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
// Serialize enclosure fields
var enclosureUrl, enclosureType sql.NullString
var enclosureLength sql.NullInt64
if item.Enclosure != nil {
enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
}
// Serialize imageUrls as JSON
var imageUrlsJSON sql.NullString
if len(item.ImageURLs) > 0 {
if data, err := json.Marshal(item.ImageURLs); err == nil {
imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
}
}
_, err := stmt.Exec(
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
)
if err != nil {
continue // Skip failed items
@@ -537,7 +677,9 @@ func (c *Crawler) saveItems(items []*Item) error {
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
enclosureUrl, enclosureType, enclosureLength, imageUrls,
publishedAt, publishedUri
FROM items
WHERE feedUrl = ?
ORDER BY pubDate DESC
@@ -548,55 +690,15 @@ func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
return scanItems(rows)
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt,
i.enclosureUrl, i.enclosureType, i.enclosureLength, i.imageUrls,
i.publishedAt, i.publishedUri
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
@@ -608,16 +710,27 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
}
defer rows.Close()
return scanItems(rows)
}
// scanItems is a helper to scan multiple item rows
func scanItems(rows *sql.Rows) ([]*Item, error) {
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
var pubDate, updatedAt, publishedAt sql.NullTime
var enclosureUrl, enclosureType sql.NullString
var enclosureLength sql.NullInt64
var imageUrlsJSON sql.NullString
var publishedUri sql.NullString
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
&publishedAt, &publishedUri,
); err != nil {
continue
}
@@ -647,6 +760,32 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
item.UpdatedAt = updatedAt.Time
}
// Parse enclosure
if enclosureUrl.Valid && enclosureUrl.String != "" {
item.Enclosure = &Enclosure{
URL: enclosureUrl.String,
Type: enclosureType.String,
}
if enclosureLength.Valid {
item.Enclosure.Length = enclosureLength.Int64
}
}
// Parse imageUrls JSON
if imageUrlsJSON.Valid && imageUrlsJSON.String != "" {
var urls []string
if err := json.Unmarshal([]byte(imageUrlsJSON.String), &urls); err == nil {
item.ImageURLs = urls
}
}
if publishedAt.Valid {
item.PublishedAt = publishedAt.Time
}
if publishedUri.Valid {
item.PublishedUri = publishedUri.String
}
items = append(items, item)
}
@@ -667,10 +806,6 @@ func (c *Crawler) CleanupOldItems() (int64, error) {
// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
if strings.Contains(feedURL, "/comment") {
return
}
// Fast path: check without lock
if c.feedExists(feedURL) {
return
@@ -690,6 +825,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
feed := &Feed{
URL: normalizeURL(feedURL),
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
LastCrawledAt: now,
Status: "active",
@@ -708,6 +844,9 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
items = c.parseAtomMetadata(body, feed)
}
// Refine category based on parsed title (e.g., "Comments on:")
feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
// Calculate next crawl time
feed.NextCrawlAt = c.calculateNextCrawl(feed)
@@ -723,11 +862,6 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
// Skip comment, category, and article feeds
if skip, _ := shouldSkipFeed(feedURL); skip {
return
}
// Fast path: check without lock
if c.feedExists(feedURL) {
return
@@ -746,6 +880,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
feed := &Feed{
URL: normalizedURL,
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
Status: "active",
SourceURL: normalizeURL(sourceURL),
@@ -896,3 +1031,103 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
return true, nil
}
// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'fail')
// If status is 'pass', the account handle is also set (auto-derived if empty)
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
feedURL = normalizeURL(feedURL)
// Auto-derive account if passing and not provided
if status == "pass" && account == "" {
account = DeriveHandleFromFeed(feedURL)
}
_, err := c.db.Exec(`
UPDATE feeds SET publishStatus = ?, publishAccount = ? WHERE url = ?
`, status, nullString(account), feedURL)
return err
}
// GetFeedsByPublishStatus returns all feeds with a specific publish status
func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate,
publishStatus, publishAccount
FROM feeds
WHERE publishStatus = ?
`, status)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetPublishCandidates returns feeds that are held for review and have items
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate,
publishStatus, publishAccount
FROM feeds
WHERE publishStatus = 'held' AND itemCount > 0 AND status = 'active'
ORDER BY itemCount DESC
LIMIT ?
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
enclosureUrl, enclosureType, enclosureLength, imageUrls,
publishedAt, publishedUri
FROM items
WHERE feedUrl = ? AND publishedAt IS NULL
ORDER BY pubDate ASC
LIMIT ?
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
_, err := c.db.Exec(`
UPDATE items SET publishedAt = datetime('now'), publishedUri = ? WHERE id = ?
`, uri, itemID)
return err
}
// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
var count int
err := c.db.QueryRow(`
SELECT COUNT(*) FROM items WHERE feedUrl = ? AND publishedAt IS NULL
`, feedURL).Scan(&count)
return count, err
}
+24 -4
@@ -3,6 +3,8 @@ package main
import (
"fmt"
"os"
"os/signal"
"syscall"
)
func main() {
@@ -17,7 +19,10 @@ func main() {
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
os.Exit(1)
}
defer crawler.Close()
// Setup graceful shutdown
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
// Start dashboard in background
go func() {
@@ -41,9 +46,24 @@ func main() {
// Stats loop (background) - updates once per minute
go crawler.StartStatsLoop()
// Cleanup loop (background) - removes old items once per hour
// Cleanup loop (background) - removes old items once per week
go crawler.StartCleanupLoop()
// Crawl loop (foreground - blocks forever)
crawler.StartCrawlLoop()
// Maintenance loop (background) - WAL checkpoints and integrity checks
go crawler.StartMaintenanceLoop()
// Crawl loop (background)
go crawler.StartCrawlLoop()
// Wait for shutdown signal
sig := <-sigChan
fmt.Printf("\nReceived %v, shutting down gracefully...\n", sig)
// Close crawler (checkpoints WAL and closes database)
if err := crawler.Close(); err != nil {
fmt.Fprintf(os.Stderr, "Error closing crawler: %v\n", err)
os.Exit(1)
}
fmt.Println("Shutdown complete")
}
+184 -8
@@ -3,6 +3,7 @@ package main
import (
"encoding/xml"
"fmt"
"regexp"
"strings"
"time"
)
@@ -23,17 +24,52 @@ type RSSChannel struct {
UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
UpdateFreq int `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
Items []RSSItem `xml:"item"`
// iTunes podcast namespace
ITunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
ITunesOwner string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
ITunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}
type RSSItem struct {
Title string `xml:"title"`
Link string `xml:"link"`
GUID string `xml:"guid"`
Description string `xml:"description"`
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
Title string `xml:"title"`
Link string `xml:"link"`
GUID string `xml:"guid"`
Description string `xml:"description"`
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
Enclosure *RSSEnclosure `xml:"enclosure"`
// iTunes item elements
ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
ITunesEpisode int `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
ITunesImage string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
// Media RSS elements
MediaContent []MediaContent `xml:"http://search.yahoo.com/mrss/ content"`
MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}
// MediaContent represents a media:content element
type MediaContent struct {
URL string `xml:"url,attr"`
Type string `xml:"type,attr"`
Medium string `xml:"medium,attr"` // image, video, audio
Width int `xml:"width,attr"`
Height int `xml:"height,attr"`
}
// MediaThumbnail represents a media:thumbnail element
type MediaThumbnail struct {
URL string `xml:"url,attr"`
Width int `xml:"width,attr"`
Height int `xml:"height,attr"`
}
type RSSEnclosure struct {
URL string `xml:"url,attr"`
Type string `xml:"type,attr"`
Length int64 `xml:"length,attr"`
}
// Atom structs for parsing
@@ -70,6 +106,43 @@ type AtomLink struct {
Type string `xml:"type,attr"`
}
// isPodcast checks if an RSS feed is a podcast based on content
func isPodcast(ch RSSChannel) bool {
// Check for iTunes namespace elements at channel level
if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
ch.ITunesExplicit != "" || ch.ITunesType != "" {
return true
}
// Check items for audio enclosures or iTunes elements
audioCount := 0
for _, item := range ch.Items {
// Check for iTunes duration or episode number
if item.ITunesDuration != "" || item.ITunesEpisode > 0 {
return true
}
// Check for audio/video enclosure
if item.Enclosure != nil && item.Enclosure.URL != "" {
mimeType := strings.ToLower(item.Enclosure.Type)
if strings.HasPrefix(mimeType, "audio/") ||
strings.HasPrefix(mimeType, "video/") ||
strings.Contains(mimeType, "mpeg") ||
strings.Contains(mimeType, "mp3") ||
strings.Contains(mimeType, "mp4") ||
strings.Contains(mimeType, "m4a") ||
strings.Contains(mimeType, "ogg") {
audioCount++
}
}
}
// If more than half the items have audio enclosures, it's a podcast
if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 {
return true
}
return false
}
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
var rss RSS
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
@@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
}
ch := rss.Channel
feed.Title = ch.Title
feed.Description = ch.Description
feed.Language = ch.Language
@@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
feed.UpdateFreq = ch.UpdateFreq
feed.ItemCount = len(ch.Items)
// Detect podcast
if isPodcast(ch) {
feed.Category = "podcast"
}
// Parse lastBuildDate
if ch.LastBuildDate != "" {
if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
@@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
}
}
// Map enclosure
if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
item.Enclosure = &Enclosure{
URL: rssItem.Enclosure.URL,
Type: rssItem.Enclosure.Type,
Length: rssItem.Enclosure.Length,
}
}
// Extract images from various sources
item.ImageURLs = extractItemImages(rssItem)
items = append(items, item)
}
@@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
// Default: crawl every 6 hours
return now.Add(6 * time.Hour)
}
// extractItemImages extracts image URLs from an RSS item
// Sources: media:content, media:thumbnail, iTunes image, and <img> tags in HTML
func extractItemImages(rssItem RSSItem) []string {
seen := make(map[string]bool)
var images []string
addImage := func(url string) {
url = strings.TrimSpace(url)
if url == "" || seen[url] {
return
}
// Basic validation
if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
return
}
seen[url] = true
images = append(images, url)
}
// 1. Media RSS content (prefer larger images)
for _, mc := range rssItem.MediaContent {
if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) {
addImage(mc.URL)
}
}
// 2. Media RSS thumbnails
for _, mt := range rssItem.MediaThumbnail {
if mt.URL != "" {
addImage(mt.URL)
}
}
// 3. iTunes image
if rssItem.ITunesImage != "" {
addImage(rssItem.ITunesImage)
}
// 4. Image enclosure
if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") {
addImage(rssItem.Enclosure.URL)
}
// 5. Extract <img> tags from description and content
htmlImages := extractImgTags(rssItem.Description)
htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...)
for _, img := range htmlImages {
addImage(img)
}
return images
}
// extractImgTags extracts src URLs from <img> tags in HTML
func extractImgTags(html string) []string {
if html == "" {
return nil
}
var urls []string
// Simple regex to find img src attributes
// Matches: src="..." or src='...'
imgRegex := regexp.MustCompile(`<img[^>]+src\s*=\s*["']([^"']+)["']`)
matches := imgRegex.FindAllStringSubmatch(html, -1)
for _, match := range matches {
if len(match) > 1 {
url := strings.TrimSpace(match[1])
// Skip data URIs, tracking pixels, and tiny images
if strings.HasPrefix(url, "data:") {
continue
}
// Skip common tracking/spacer images
if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
continue
}
urls = append(urls, url)
}
}
return urls
}
+909
@@ -0,0 +1,909 @@
package main
import (
"bytes"
"crypto/sha256"
"encoding/base32"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
)
// Publisher handles posting items to AT Protocol PDS
type Publisher struct {
pdsHost string
httpClient *http.Client
}
// PDSSession holds authentication info for a PDS account
type PDSSession struct {
DID string `json:"did"`
Handle string `json:"handle"`
AccessJwt string `json:"accessJwt"`
RefreshJwt string `json:"refreshJwt"`
}
// BskyPost represents an app.bsky.feed.post record
type BskyPost struct {
Type string `json:"$type"`
Text string `json:"text"`
CreatedAt string `json:"createdAt"`
Facets []BskyFacet `json:"facets,omitempty"`
Embed *BskyEmbed `json:"embed,omitempty"`
}
type BskyFacet struct {
Index BskyByteSlice `json:"index"`
Features []BskyFeature `json:"features"`
}
type BskyByteSlice struct {
ByteStart int `json:"byteStart"`
ByteEnd int `json:"byteEnd"`
}
type BskyFeature struct {
Type string `json:"$type"`
URI string `json:"uri,omitempty"`
}
type BskyEmbed struct {
Type string `json:"$type"`
External *BskyExternal `json:"external,omitempty"`
Images []BskyImage `json:"images,omitempty"`
}
type BskyExternal struct {
URI string `json:"uri"`
Title string `json:"title"`
Description string `json:"description"`
Thumb *BlobRef `json:"thumb,omitempty"`
}
type BskyImage struct {
Alt string `json:"alt"`
Image *BlobRef `json:"image"`
}
// NewPublisher creates a new Publisher instance
func NewPublisher(pdsHost string) *Publisher {
return &Publisher{
pdsHost: pdsHost,
httpClient: &http.Client{
Timeout: 30 * time.Second,
},
}
}
// CreateSession authenticates with the PDS and returns a session
func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) {
payload := map[string]string{
"identifier": handle,
"password": password,
}
body, err := json.Marshal(payload)
if err != nil {
return nil, err
}
resp, err := p.httpClient.Post(
p.pdsHost+"/xrpc/com.atproto.server.createSession",
"application/json",
bytes.NewReader(body),
)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(respBody))
}
var session PDSSession
if err := json.NewDecoder(resp.Body).Decode(&session); err != nil {
return nil, err
}
return &session, nil
}
// CreateAccount creates a new account on the PDS
// Requires an invite code if the PDS has invites enabled
func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) {
payload := map[string]interface{}{
"handle": handle,
"email": email,
"password": password,
}
if inviteCode != "" {
payload["inviteCode"] = inviteCode
}
body, err := json.Marshal(payload)
if err != nil {
return nil, err
}
resp, err := p.httpClient.Post(
p.pdsHost+"/xrpc/com.atproto.server.createAccount",
"application/json",
bytes.NewReader(body),
)
if err != nil {
return nil, err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(respBody))
}
var session PDSSession
if err := json.Unmarshal(respBody, &session); err != nil {
return nil, err
}
return &session, nil
}
// CreateInviteCode creates an invite code using PDS admin password (Basic Auth)
func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) {
payload := map[string]interface{}{
"useCount": useCount,
}
body, err := json.Marshal(payload)
if err != nil {
return "", err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(body))
if err != nil {
return "", err
}
req.Header.Set("Content-Type", "application/json")
// PDS admin APIs use Basic Auth with "admin" as username
req.SetBasicAuth("admin", adminPassword)
resp, err := p.httpClient.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(respBody))
}
var result struct {
Code string `json:"code"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return "", err
}
return result.Code, nil
}
// GenerateRkey creates a deterministic rkey from a GUID and timestamp
// Uses a truncated base32-encoded SHA256 hash
// Including the timestamp allows regenerating a new rkey by updating discoveredAt
func GenerateRkey(guid string, timestamp time.Time) string {
if guid == "" {
return ""
}
// Combine GUID with timestamp for the hash input
// Format timestamp to second precision for consistency
input := guid + "|" + timestamp.UTC().Format(time.RFC3339)
hash := sha256.Sum256([]byte(input))
// Use first 10 bytes (80 bits) - plenty for uniqueness
// Base32 encode without padding, lowercase for rkey compatibility
encoded := base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(hash[:10])
return strings.ToLower(encoded)
}
// extractURLs finds all URLs in a string
func extractURLs(text string) []string {
// Match http:// or https:// URLs
urlRegex := regexp.MustCompile(`https?://[^\s<>"'\)]+`)
matches := urlRegex.FindAllString(text, -1)
// Clean up trailing punctuation
var urls []string
for _, u := range matches {
// Remove trailing punctuation that's likely not part of the URL
u = strings.TrimRight(u, ".,;:!?")
if u != "" {
urls = append(urls, u)
}
}
return urls
}
// PublishItem posts a feed item to the PDS
// Returns the AT URI of the created record, or error
func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
if item.GUID == "" && item.Link == "" {
return "", fmt.Errorf("item has no GUID or link, cannot publish")
}
// Collect all unique URLs: main link + any URLs in description
urlSet := make(map[string]bool)
var allURLs []string
// Add main link first
if item.Link != "" {
urlSet[item.Link] = true
allURLs = append(allURLs, item.Link)
}
// Add enclosure URL for podcasts/media (audio/video)
if item.Enclosure != nil && item.Enclosure.URL != "" {
encType := strings.ToLower(item.Enclosure.Type)
if strings.HasPrefix(encType, "audio/") || strings.HasPrefix(encType, "video/") {
if !urlSet[item.Enclosure.URL] {
urlSet[item.Enclosure.URL] = true
allURLs = append(allURLs, item.Enclosure.URL)
}
}
}
// Extract URLs from description
descURLs := extractURLs(item.Description)
for _, u := range descURLs {
if !urlSet[u] {
urlSet[u] = true
allURLs = append(allURLs, u)
}
}
// Extract URLs from content if available
contentURLs := extractURLs(item.Content)
for _, u := range contentURLs {
if !urlSet[u] {
urlSet[u] = true
allURLs = append(allURLs, u)
}
}
// Build post text: title + all links
// Bluesky has 300 grapheme limit
var textBuilder strings.Builder
textBuilder.WriteString(item.Title)
for _, u := range allURLs {
textBuilder.WriteString("\n\n")
textBuilder.WriteString(u)
}
text := textBuilder.String()
// Truncate title if text is too long (keep URLs intact)
const maxLen = 300
if len(text) > maxLen {
// Calculate space needed for URLs
urlSpace := 0
for _, u := range allURLs {
urlSpace += len(u) + 2 // +2 for \n\n
}
maxTitleLen := maxLen - urlSpace - 3 // -3 for "..."
if maxTitleLen > 10 {
text = item.Title[:maxTitleLen] + "..."
for _, u := range allURLs {
text += "\n\n" + u
}
}
}
// Use item's pubDate for createdAt, fall back to now
createdAt := time.Now()
if !item.PubDate.IsZero() {
createdAt = item.PubDate
}
post := BskyPost{
Type: "app.bsky.feed.post",
Text: text,
CreatedAt: createdAt.Format(time.RFC3339),
}
// Add facets for all URLs
for _, u := range allURLs {
linkStart := strings.Index(text, u)
if linkStart >= 0 {
// Use byte positions (for UTF-8 this matters)
byteStart := len(text[:linkStart])
byteEnd := byteStart + len(u)
post.Facets = append(post.Facets, BskyFacet{
Index: BskyByteSlice{
ByteStart: byteStart,
ByteEnd: byteEnd,
},
Features: []BskyFeature{
{
Type: "app.bsky.richtext.facet#link",
URI: u,
},
},
})
}
}
// Decide embed type based on content
// Priority: images > external link card
if len(item.ImageURLs) > 0 {
// Try to upload images (up to 4)
uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title)
if len(uploadedImages) > 0 {
post.Embed = &BskyEmbed{
Type: "app.bsky.embed.images",
Images: uploadedImages,
}
}
}
// Fall back to external embed if no images were uploaded
if post.Embed == nil && len(allURLs) > 0 {
external := &BskyExternal{
URI: allURLs[0],
Title: item.Title,
Description: truncate(stripHTML(item.Description), 300),
}
// Try to add thumbnail from first image
if len(item.ImageURLs) > 0 {
if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
external.Thumb = thumb
}
}
post.Embed = &BskyEmbed{
Type: "app.bsky.embed.external",
External: external,
}
}
// Use GUID + discoveredAt for deterministic rkey
// This allows regenerating a new rkey by updating discoveredAt if needed
guidForRkey := item.GUID
if guidForRkey == "" {
guidForRkey = item.Link
}
rkey := GenerateRkey(guidForRkey, item.DiscoveredAt)
// Create the record with deterministic rkey
payload := map[string]interface{}{
"repo": session.DID,
"collection": "app.bsky.feed.post",
"rkey": rkey,
"record": post,
}
body, err := json.Marshal(payload)
if err != nil {
return "", err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
if err != nil {
return "", err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("create record failed: %s - %s", resp.Status, string(respBody))
}
var result struct {
URI string `json:"uri"`
CID string `json:"cid"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return "", err
}
return result.URI, nil
}
// uploadImages fetches and uploads up to 4 images, returning BskyImage structs
func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage {
var images []BskyImage
maxImages := 4
if len(imageURLs) < maxImages {
maxImages = len(imageURLs)
}
for i := 0; i < maxImages; i++ {
blob := p.fetchAndUploadImage(session, imageURLs[i])
if blob != nil {
images = append(images, BskyImage{
Alt: altText,
Image: blob,
})
}
}
return images
}
// fetchAndUploadImage downloads an image and uploads it to the PDS
func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef {
// Fetch the image
resp, err := p.httpClient.Get(imageURL)
if err != nil {
return nil
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil
}
// Check content type
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
// Try to guess from URL
if strings.HasSuffix(strings.ToLower(imageURL), ".png") {
contentType = "image/png"
} else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") {
contentType = "image/gif"
} else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") {
contentType = "image/webp"
} else {
contentType = "image/jpeg" // Default
}
}
// Only accept image types
if !strings.HasPrefix(contentType, "image/") {
return nil
}
// Read image data (limit to 1MB to avoid issues)
data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
if err != nil || len(data) == 0 {
return nil
}
// Upload to PDS
blob, err := p.UploadBlob(session, data, contentType)
if err != nil {
return nil
}
return blob
}
func truncate(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen-3] + "..."
}
// stripHTML removes HTML tags from a string
func stripHTML(s string) string {
// Remove HTML tags
tagRegex := regexp.MustCompile(`<[^>]*>`)
s = tagRegex.ReplaceAllString(s, "")
// Decode common HTML entities
s = strings.ReplaceAll(s, "&amp;", "&")
s = strings.ReplaceAll(s, "&lt;", "<")
s = strings.ReplaceAll(s, "&gt;", ">")
s = strings.ReplaceAll(s, "&quot;", "\"")
s = strings.ReplaceAll(s, "&#39;", "'")
s = strings.ReplaceAll(s, "&nbsp;", " ")
// Collapse whitespace
spaceRegex := regexp.MustCompile(`\s+`)
s = spaceRegex.ReplaceAllString(s, " ")
return strings.TrimSpace(s)
}
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {combined-path-and-hostname}.1440.news
// The PDS limits subdomains to 18 characters, so we prioritize meaningful parts
// Example: news.ycombinator.com/showrss → show-ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
const maxSubdomainLen = 18
// Ensure we have a scheme for parsing
if !strings.Contains(feedURL, "://") {
feedURL = "https://" + feedURL
}
u, err := url.Parse(feedURL)
if err != nil {
return ""
}
hostname := strings.ToLower(u.Hostname())
path := strings.ToLower(u.Path)
// Remove common feed suffixes/extensions
suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
for _, suffix := range suffixesToRemove {
path = strings.TrimSuffix(path, suffix)
}
// Split path into segments
segments := strings.Split(strings.Trim(path, "/"), "/")
// Filter out common feed-related words
skipWords := map[string]bool{
"rss": true, "feed": true, "feeds": true, "atom": true,
"xml": true, "default": true, "index": true, "services": true,
"nyt": true, // NYTimes uses /services/xml/rss/nyt/
}
var pathParts []string
for _, seg := range segments {
seg = cleanHandleSegment(seg)
if seg != "" && !skipWords[seg] {
pathParts = append(pathParts, seg)
}
}
// Split hostname into parts, drop common TLDs to save space
hostParts := strings.Split(hostname, ".")
commonTLDs := map[string]bool{
"com": true, "org": true, "net": true, "io": true, "co": true,
"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
}
// Remove TLD if it's common (to save characters)
if len(hostParts) > 1 && commonTLDs[hostParts[len(hostParts)-1]] {
hostParts = hostParts[:len(hostParts)-1]
}
// Build subdomain: path parts first (they differentiate feeds), then host parts
// Priority order for fitting in 18 chars:
// 1. Main hostname part (e.g., "ycombinator")
// 2. Path prefix (e.g., "show")
// 3. Hostname subdomain (e.g., "news")
var subdomain string
// After dropping the TLD, the last remaining part is the main hostname
// (e.g., "ycombinator" from "news.ycombinator")
mainHost := hostParts[len(hostParts)-1]
// If path parts exist, prepend them
if len(pathParts) > 0 {
subdomain = pathParts[0] + "-" + mainHost
} else if len(hostParts) > 1 {
// No path, use subdomain-hostname (e.g., "news-ycombinator")
subdomain = hostParts[0] + "-" + mainHost
} else {
subdomain = mainHost
}
// If still too long, just use main hostname
if len(subdomain) > maxSubdomainLen {
subdomain = mainHost
}
// Final safety: truncate if still too long
if len(subdomain) > maxSubdomainLen {
subdomain = subdomain[:maxSubdomainLen]
}
subdomain = strings.Trim(subdomain, "-")
// Collapse multiple hyphens
for strings.Contains(subdomain, "--") {
subdomain = strings.ReplaceAll(subdomain, "--", "-")
}
return subdomain + ".1440.news"
}
// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
func cleanHandleSegment(s string) string {
// Remove file extensions
if idx := strings.LastIndex(s, "."); idx > 0 {
s = s[:idx]
}
// Convert to lowercase
s = strings.ToLower(s)
// Strip common feed prefixes/suffixes from the segment itself
// e.g., "showrss" → "show", "rssworld" → "world"
feedAffixes := []string{"rss", "feed", "atom", "xml"}
for _, affix := range feedAffixes {
// Strip suffix (e.g., "showrss" → "show")
if strings.HasSuffix(s, affix) && len(s) > len(affix) {
s = strings.TrimSuffix(s, affix)
break
}
// Strip prefix (e.g., "rssworld" → "world")
if strings.HasPrefix(s, affix) && len(s) > len(affix) {
s = strings.TrimPrefix(s, affix)
break
}
}
// Replace underscores and other separators with hyphens
s = strings.ReplaceAll(s, "_", "-")
s = strings.ReplaceAll(s, " ", "-")
// Remove any characters that aren't alphanumeric or hyphens
reg := regexp.MustCompile(`[^a-z0-9-]`)
s = reg.ReplaceAllString(s, "")
// Collapse multiple hyphens
for strings.Contains(s, "--") {
s = strings.ReplaceAll(s, "--", "-")
}
// Trim leading/trailing hyphens
s = strings.Trim(s, "-")
return s
}
// SplitHandle extracts the path prefix and hostname from a derived handle
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
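// Note: handles produced by DeriveHandleFromFeed contain no dots before
// ".1440.news", so they fall through to the no-prefix fallback below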
func SplitHandle(handle string) (prefix string, hostname string) {
// Remove .1440.news suffix
handle = strings.TrimSuffix(handle, ".1440.news")
parts := strings.Split(handle, ".")
// Try to find where hostname starts by looking for valid hostname patterns
if len(parts) >= 2 {
for i := 0; i < len(parts)-1; i++ {
remaining := strings.Join(parts[i:], ".")
if looksLikeHostname(remaining) {
if i > 0 {
prefix = strings.Join(parts[:i], ".")
}
hostname = remaining
return
}
}
}
// Fallback: no prefix, entire thing is hostname
hostname = handle
return "", hostname
}
func isLikelyTLDPart(s string) bool {
tlds := map[string]bool{
"com": true, "org": true, "net": true, "edu": true, "gov": true,
"io": true, "co": true, "uk": true, "de": true, "fr": true,
"jp": true, "au": true, "ca": true, "nl": true, "se": true,
"news": true, "blog": true, "tech": true, "dev": true,
}
return tlds[s]
}
func isTwoPartTLD(first, second string) bool {
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
"org.uk": true, "net.au": true, "com.br": true,
}
return twoPartTLDs[first+"."+second]
}
func looksLikeHostname(s string) bool {
// A hostname typically has at least one dot and ends with a TLD-like part
parts := strings.Split(s, ".")
if len(parts) < 2 {
return false
}
lastPart := parts[len(parts)-1]
return isLikelyTLDPart(lastPart)
}
// BlobRef is a blob reference returned by the PDS, used for embedded images and profile avatars
type BlobRef struct {
Type string `json:"$type"`
Ref Link `json:"ref"`
MimeType string `json:"mimeType"`
Size int64 `json:"size"`
}
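// Link is the JSON form of a CID reference ({"$link": "<cid>"}) used inside blob refs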
type Link struct {
Link string `json:"$link"`
}
// UploadBlob uploads an image to the PDS and returns a blob reference
func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) {
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data))
if err != nil {
return nil, err
}
req.Header.Set("Content-Type", mimeType)
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(respBody))
}
var result struct {
Blob BlobRef `json:"blob"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return nil, err
}
return &result.Blob, nil
}
// UpdateProfile updates the profile for an account
func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error {
// First, get the current profile to preserve any existing fields
getReq, err := http.NewRequest("GET",
p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self",
nil)
if err != nil {
return err
}
getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt)
getResp, err := p.httpClient.Do(getReq)
var existingCID string
profile := map[string]interface{}{
"$type": "app.bsky.actor.profile",
}
if err == nil && getResp.StatusCode == http.StatusOK {
defer getResp.Body.Close()
var existing struct {
CID string `json:"cid"`
Value map[string]interface{} `json:"value"`
}
if json.NewDecoder(getResp.Body).Decode(&existing) == nil {
existingCID = existing.CID
profile = existing.Value
}
} else if getResp != nil {
getResp.Body.Close()
}
// Update fields
if displayName != "" {
profile["displayName"] = displayName
}
if description != "" {
profile["description"] = description
}
if avatar != nil {
profile["avatar"] = avatar
}
// Put the record
payload := map[string]interface{}{
"repo": session.DID,
"collection": "app.bsky.actor.profile",
"rkey": "self",
"record": profile,
}
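// swapRecord turns putRecord into a compare-and-swap: the write fails if the
// profile record changed since we fetched it, so concurrent edits aren't clobbered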
if existingCID != "" {
payload["swapRecord"] = existingCID
}
body, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody))
}
return nil
}
// FetchFavicon downloads a favicon/icon from a URL
func FetchFavicon(siteURL string) ([]byte, string, error) {
// Try common favicon locations
if !strings.HasPrefix(siteURL, "http") {
siteURL = "https://" + siteURL
}
u, err := url.Parse(siteURL)
if err != nil {
return nil, "", err
}
baseURL := u.Scheme + "://" + u.Host
// Try apple-touch-icon first (usually higher quality)
iconURLs := []string{
baseURL + "/apple-touch-icon.png",
baseURL + "/apple-touch-icon-precomposed.png",
baseURL + "/favicon.png",
baseURL + "/favicon.ico",
}
client := &http.Client{Timeout: 10 * time.Second}
for _, iconURL := range iconURLs {
resp, err := client.Get(iconURL)
if err != nil {
continue
}
if resp.StatusCode != http.StatusOK {
resp.Body.Close()
continue
}
data, err := io.ReadAll(resp.Body)
resp.Body.Close() // close per iteration; defer in a loop would hold bodies open until return
if err != nil {
continue
}
// Determine mime type
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
if strings.HasSuffix(iconURL, ".png") {
contentType = "image/png"
} else if strings.HasSuffix(iconURL, ".ico") {
contentType = "image/x-icon"
} else {
contentType = "image/png" // default
}
}
return data, contentType, nil
}
return nil, "", fmt.Errorf("no favicon found for %s", siteURL)
}
+28
View File
@@ -53,3 +53,31 @@ td { font-size: 13px; color: #ffffff; }
#searchInput::placeholder { color: #555; }
.search-host { margin-bottom: 10px; }
.search-feed:hover { background: #1a1a1a; }
/* Command buttons */
.cmd-btn {
background: #1a1a1a;
border: 1px solid #333;
border-radius: 4px;
color: #0af;
padding: 6px 12px;
margin-right: 8px;
margin-bottom: 4px;
font-size: 13px;
font-family: monospace;
cursor: pointer;
transition: background 0.2s, border-color 0.2s;
}
.cmd-btn:hover {
background: #252525;
border-color: #0af;
}
.cmd-btn:active {
background: #0af;
color: #000;
}
/* Visit link */
.visit-link:hover {
color: #0af !important;
}
+566 -431
View File
File diff suppressed because it is too large