Add AT Protocol publishing, media support, and SQLite stability
Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt)
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
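The publisher diff further down defines the record types, the deterministic rkey scheme, and the URL/facet helpers, but this view is truncated before the actual record write. As a rough, hedged sketch of that final step, assuming the standard com.atproto.repo.createRecord XRPC call and reusing the types added in this commit (the method name and post text layout here are made up):

```go
// Sketch only (not from this commit): write one crawled item as an
// app.bsky.feed.post record, addressed by its deterministic rkey.
func (p *Publisher) publishSketch(session *PDSSession, item *Item) (string, error) {
	post := BskyPost{
		Type:      "app.bsky.feed.post",
		Text:      item.Title + "\n" + item.Link,
		CreatedAt: time.Now().UTC().Format(time.RFC3339),
	}
	body, err := json.Marshal(map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.feed.post",
		"rkey":       GenerateRkey(item.GUID, item.DiscoveredAt), // SHA256(guid|discoveredAt), base32
		"record":     post,
	})
	if err != nil {
		return "", err
	}
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	var out struct {
		URI string `json:"uri"` // at:// URI, what MarkItemPublished stores
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return "", err
	}
	return out.URI, nil
}
```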
+60 -3
@@ -61,6 +61,13 @@ func NewCrawler(dbPath string) (*Crawler, error) {

func (c *Crawler) Close() error {
    if c.db != nil {
        // Checkpoint WAL to merge it back into main database before closing
        // This prevents corruption if the container is stopped mid-write
        fmt.Println("Checkpointing WAL...")
        if _, err := c.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)"); err != nil {
            fmt.Printf("WAL checkpoint warning: %v\n", err)
        }
        fmt.Println("Closing database...")
        return c.db.Close()
    }
    return nil
@@ -87,6 +94,56 @@ func (c *Crawler) StartCleanupLoop() {
    }
}

// StartMaintenanceLoop performs periodic database maintenance
// - WAL checkpoint every 5 minutes to prevent WAL bloat and reduce corruption risk
// - Quick integrity check every hour to detect issues early
// - Hot backup every 24 hours for recovery
func (c *Crawler) StartMaintenanceLoop() {
    checkpointTicker := time.NewTicker(5 * time.Minute)
    integrityTicker := time.NewTicker(1 * time.Hour)
    backupTicker := time.NewTicker(24 * time.Hour)
    defer checkpointTicker.Stop()
    defer integrityTicker.Stop()
    defer backupTicker.Stop()

    for {
        select {
        case <-checkpointTicker.C:
            // Passive checkpoint - doesn't block writers
            if _, err := c.db.Exec("PRAGMA wal_checkpoint(PASSIVE)"); err != nil {
                fmt.Printf("WAL checkpoint error: %v\n", err)
            }

        case <-integrityTicker.C:
            // Quick check is faster than full integrity_check
            var result string
            if err := c.db.QueryRow("PRAGMA quick_check").Scan(&result); err != nil {
                fmt.Printf("Integrity check error: %v\n", err)
            } else if result != "ok" {
                fmt.Printf("WARNING: Database integrity issue detected: %s\n", result)
            }

        case <-backupTicker.C:
            c.createBackup()
        }
    }
}

// createBackup creates a hot backup of the database using SQLite's backup API
func (c *Crawler) createBackup() {
    backupPath := "feeds/feeds.db.backup"
    fmt.Println("Creating database backup...")

    // Use SQLite's online backup via VACUUM INTO (available in SQLite 3.27+)
    // This creates a consistent snapshot without blocking writers
    if _, err := c.db.Exec("VACUUM INTO ?", backupPath); err != nil {
        fmt.Printf("Backup error: %v\n", err)
        return
    }

    fmt.Printf("Backup created: %s\n", backupPath)
}

// StartCrawlLoop runs the domain crawling loop independently
func (c *Crawler) StartCrawlLoop() {
    numWorkers := runtime.NumCPU()
@@ -113,9 +170,9 @@ func (c *Crawler) StartCrawlLoop() {
        }()
    }

    const fetchSize = 100
    const fetchSize = 1000
    for {
        domains, err := c.GetUncheckedDomainsRandom(fetchSize)
        domains, err := c.GetUncheckedDomains(fetchSize)
        if err != nil {
            fmt.Printf("Error fetching domains: %v\n", err)
        }
@@ -155,7 +212,7 @@ func (c *Crawler) StartCheckLoop() {
        }()
    }

    const fetchSize = 100
    const fetchSize = 1000
    for {
        feeds, err := c.GetFeedsDueForCheck(fetchSize)
        if err != nil {
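The maintenance loop above creates a daily VACUUM INTO snapshot but never checks it. A minimal follow-up sketch, not part of this commit (same modernc.org/sqlite driver, same hard-coded backup path):

```go
// verifyBackup opens the snapshot produced by createBackup and runs
// PRAGMA quick_check against it, so a corrupt backup is caught early.
func verifyBackup() error {
	db, err := sql.Open("sqlite", "feeds/feeds.db.backup")
	if err != nil {
		return err
	}
	defer db.Close()

	var result string
	if err := db.QueryRow("PRAGMA quick_check").Scan(&result); err != nil {
		return err
	}
	if result != "ok" {
		return fmt.Errorf("backup failed quick_check: %s", result)
	}
	return nil
}
```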
+1569 -66 (file diff suppressed because it is too large)
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
@@ -25,6 +26,7 @@ CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WH
|
||||
CREATE TABLE IF NOT EXISTS feeds (
|
||||
url TEXT PRIMARY KEY,
|
||||
type TEXT,
|
||||
category TEXT DEFAULT 'main',
|
||||
title TEXT,
|
||||
description TEXT,
|
||||
language TEXT,
|
||||
@@ -56,14 +58,20 @@ CREATE TABLE IF NOT EXISTS feeds (
|
||||
oldestItemDate DATETIME,
|
||||
newestItemDate DATETIME,
|
||||
|
||||
noUpdate INTEGER DEFAULT 0
|
||||
noUpdate INTEGER DEFAULT 0,
|
||||
|
||||
-- Publishing to PDS
|
||||
publishStatus TEXT DEFAULT 'held' CHECK(publishStatus IN ('held', 'pass', 'fail')),
|
||||
publishAccount TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_publishStatus ON feeds(publishStatus);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
|
||||
@@ -80,6 +88,17 @@ CREATE TABLE IF NOT EXISTS items (
|
||||
pubDate DATETIME,
|
||||
discoveredAt DATETIME NOT NULL,
|
||||
updatedAt DATETIME,
|
||||
|
||||
-- Media attachments
|
||||
enclosureUrl TEXT,
|
||||
enclosureType TEXT,
|
||||
enclosureLength INTEGER,
|
||||
imageUrls TEXT, -- JSON array of image URLs
|
||||
|
||||
-- Publishing to PDS
|
||||
publishedAt DATETIME,
|
||||
publishedUri TEXT,
|
||||
|
||||
UNIQUE(feedUrl, guid)
|
||||
);
|
||||
|
||||
@@ -87,6 +106,7 @@ CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
|
||||
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
|
||||
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feedUrl, publishedAt) WHERE publishedAt IS NULL;
|
||||
|
||||
-- Full-text search for feeds
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
|
||||
@@ -148,15 +168,22 @@ func OpenDatabase(dbPath string) (*sql.DB, error) {
|
||||
fmt.Printf("Opening database: %s\n", dbPath)
|
||||
|
||||
// Use pragmas in connection string for consistent application
|
||||
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
|
||||
// - busy_timeout: wait up to 10s for locks instead of failing immediately
|
||||
// - journal_mode: WAL for better concurrency and crash recovery
|
||||
// - synchronous: NORMAL is safe with WAL (fsync at checkpoint, not every commit)
|
||||
// - wal_autocheckpoint: checkpoint every 1000 pages (~4MB) to prevent WAL bloat
|
||||
// - foreign_keys: enforce referential integrity
|
||||
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=wal_autocheckpoint(1000)&_pragma=foreign_keys(ON)"
|
||||
db, err := sql.Open("sqlite", connStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open database: %v", err)
|
||||
}
|
||||
|
||||
// Allow multiple readers (WAL mode supports concurrent reads)
|
||||
// SQLite is single-writer, but reads can happen concurrently
|
||||
db.SetMaxOpenConns(4)
|
||||
// Connection pool settings for stability
|
||||
db.SetMaxOpenConns(4) // Limit concurrent connections
|
||||
db.SetMaxIdleConns(2) // Keep some connections warm
|
||||
db.SetConnMaxLifetime(5 * time.Minute) // Recycle connections periodically
|
||||
db.SetConnMaxIdleTime(1 * time.Minute) // Close idle connections
|
||||
|
||||
// Verify connection and show journal mode
|
||||
var journalMode string
|
||||
@@ -173,6 +200,17 @@ func OpenDatabase(dbPath string) (*sql.DB, error) {
|
||||
}
|
||||
fmt.Println(" Schema OK")
|
||||
|
||||
// Migrations for existing databases
|
||||
migrations := []string{
|
||||
"ALTER TABLE items ADD COLUMN enclosureUrl TEXT",
|
||||
"ALTER TABLE items ADD COLUMN enclosureType TEXT",
|
||||
"ALTER TABLE items ADD COLUMN enclosureLength INTEGER",
|
||||
"ALTER TABLE items ADD COLUMN imageUrls TEXT",
|
||||
}
|
||||
for _, m := range migrations {
|
||||
db.Exec(m) // Ignore errors (column may already exist)
|
||||
}
|
||||
|
||||
// Run stats and ANALYZE in background to avoid blocking startup with large databases
|
||||
go func() {
|
||||
var domainCount, feedCount int
|
||||
|
||||
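Because the pragmas above are applied through the connection string, every pooled connection picks them up. A small sketch (not in the commit) for spot-checking the values actually in effect:

```go
// checkPragmas logs the journal mode and synchronous level reported by SQLite.
// With the DSN above this should print journal_mode=wal synchronous=1 (NORMAL).
func checkPragmas(db *sql.DB) {
	var journalMode string
	var synchronous int
	db.QueryRow("PRAGMA journal_mode").Scan(&journalMode)
	db.QueryRow("PRAGMA synchronous").Scan(&synchronous)
	fmt.Printf("journal_mode=%s synchronous=%d\n", journalMode, synchronous)
}
```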
@@ -3,6 +3,7 @@ services:
    build: .
    container_name: app-1440-news
    restart: unless-stopped
    stop_grace_period: 30s
    env_file:
      - pds.env
    volumes:
@@ -88,26 +88,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||
return domain, nil
|
||||
}
|
||||
|
||||
// GetUncheckedDomains returns all domains with status "unchecked"
|
||||
func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) {
|
||||
// GetUncheckedDomains returns up to limit unchecked domains ordered by discoveredAt (FIFO)
|
||||
func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||
FROM domains WHERE status = 'unchecked'
|
||||
`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return c.scanDomains(rows)
|
||||
}
|
||||
|
||||
// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order
|
||||
func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||
FROM domains WHERE status = 'unchecked'
|
||||
ORDER BY RANDOM()
|
||||
ORDER BY discoveredAt ASC
|
||||
LIMIT ?
|
||||
`, limit)
|
||||
if err != nil {
|
||||
@@ -224,7 +210,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||
buf := make([]byte, 0, 64*1024)
|
||||
scanner.Buffer(buf, 1024*1024)
|
||||
|
||||
const batchSize = 10000
|
||||
const batchSize = 1000
|
||||
now := time.Now()
|
||||
nowStr := now.Format("2006-01-02 15:04:05")
|
||||
totalImported := 0
|
||||
|
||||
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@@ -12,58 +13,91 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// shouldSkipFeed checks if a feed URL should be filtered out
|
||||
// Returns true (and a reason) if the feed should be skipped
|
||||
func shouldSkipFeed(feedURL string) (bool, string) {
|
||||
// classifyFeed determines the category of a feed based on URL patterns
|
||||
// Returns: "main", "comments", "category", "author", "article", "podcast"
|
||||
// Note: podcast detection is also done in parseRSSMetadata based on content
|
||||
func classifyFeed(feedURL string) string {
|
||||
lower := strings.ToLower(feedURL)
|
||||
|
||||
// Skip explicit comment feeds
|
||||
// Comment feeds
|
||||
if strings.Contains(lower, "/comment") {
|
||||
return true, "comment feed"
|
||||
return "comments"
|
||||
}
|
||||
|
||||
// Podcast URL patterns
|
||||
podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
|
||||
for _, pattern := range podcastPatterns {
|
||||
if strings.Contains(lower, pattern) {
|
||||
return "podcast"
|
||||
}
|
||||
}
|
||||
|
||||
u, err := url.Parse(feedURL)
|
||||
if err != nil {
|
||||
return false, ""
|
||||
return "main"
|
||||
}
|
||||
|
||||
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
|
||||
|
||||
// Skip category/tag feeds
|
||||
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
|
||||
// Author feeds
|
||||
if strings.Contains(path, "/author/") {
|
||||
return "author"
|
||||
}
|
||||
|
||||
// Category/tag feeds
|
||||
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
|
||||
for _, pattern := range categoryPatterns {
|
||||
if strings.Contains(path, pattern) {
|
||||
return true, "category/tag feed"
|
||||
return "category"
|
||||
}
|
||||
}
|
||||
|
||||
// Check for article comment feeds (path ending in /feed with content before it)
|
||||
// Check for article feeds (path ending in /feed with content before it)
|
||||
if strings.HasSuffix(path, "/feed") {
|
||||
basePath := strings.TrimSuffix(path, "/feed")
|
||||
basePath = strings.Trim(basePath, "/")
|
||||
|
||||
if basePath == "" {
|
||||
return false, "" // Just /feed - legitimate main feed
|
||||
return "main" // Just /feed - main feed
|
||||
}
|
||||
|
||||
// Skip if path contains date patterns (likely article)
|
||||
// Article if path contains date patterns
|
||||
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
|
||||
return true, "article feed (date pattern)"
|
||||
return "article"
|
||||
}
|
||||
|
||||
// Skip if path has multiple segments (likely article or nested content)
|
||||
// Article if path has multiple segments (nested content)
|
||||
segments := strings.Split(basePath, "/")
|
||||
if len(segments) >= 2 {
|
||||
return true, "article feed (nested path)"
|
||||
return "article"
|
||||
}
|
||||
|
||||
// Skip if single segment looks like an article slug (contains hyphens, is long)
|
||||
if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
|
||||
return true, "article feed (slug pattern)"
|
||||
// Article if single segment looks like an article slug
|
||||
if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 {
|
||||
return "article"
|
||||
}
|
||||
}
|
||||
|
||||
return false, ""
|
||||
return "main"
|
||||
}
|
||||
|
||||
// classifyFeedByTitle refines category based on feed title (called after parsing)
|
||||
func classifyFeedByTitle(title string, currentCategory string) string {
|
||||
if currentCategory != "main" {
|
||||
return currentCategory // Already classified by URL
|
||||
}
|
||||
lower := strings.ToLower(title)
|
||||
if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") {
|
||||
return "comments"
|
||||
}
|
||||
return currentCategory
|
||||
}
|
||||
|
||||
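For reference, a few made-up URLs and the category the rules above would assign (illustrative only, not taken from the repository's tests):

```go
// classifyFeedExamples shows expected classifyFeed results for hypothetical URLs.
func classifyFeedExamples() map[string]string {
	return map[string]string{
		"https://example.com/feed":                          "main",
		"https://example.com/comments/feed":                 "comments", // "/comment" substring
		"https://example.com/podcasts/show/42":              "podcast",  // podcast URL pattern
		"https://example.com/author/jane/feed":              "author",
		"https://example.com/category/tech/feed":            "category",
		"https://example.com/blog/2024/05/launch/feed":      "article", // date pattern
		"https://example.com/a-very-long-article-slug/feed": "article", // long hyphenated slug
	}
}
```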
// Enclosure represents a media attachment (audio, video, image)
|
||||
type Enclosure struct {
|
||||
URL string `json:"url"`
|
||||
Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
|
||||
Length int64 `json:"length"` // Size in bytes
|
||||
}
|
||||
|
||||
// Item represents an individual entry/article from a feed
|
||||
@@ -79,12 +113,21 @@ type Item struct {
|
||||
PubDate time.Time `json:"pub_date,omitempty"`
|
||||
DiscoveredAt time.Time `json:"discovered_at"`
|
||||
UpdatedAt time.Time `json:"updated_at,omitempty"`
|
||||
|
||||
// Media attachments
|
||||
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
|
||||
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
|
||||
|
||||
// Publishing to PDS
|
||||
PublishedAt time.Time `json:"published_at,omitempty"`
|
||||
PublishedUri string `json:"published_uri,omitempty"`
|
||||
}
|
||||
|
||||
// Feed represents a discovered RSS/Atom feed with metadata
|
||||
type Feed struct {
|
||||
URL string `json:"url"`
|
||||
Type string `json:"type"` // "rss", "atom", or "unknown"
|
||||
Type string `json:"type"` // "rss", "atom", or "unknown"
|
||||
Category string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
|
||||
Title string `json:"title,omitempty"`
|
||||
Description string `json:"description,omitempty"`
|
||||
Language string `json:"language,omitempty"`
|
||||
@@ -124,23 +167,35 @@ type Feed struct {
|
||||
|
||||
// Adaptive check interval
|
||||
NoUpdate int `json:"no_update"` // Consecutive checks with no change
|
||||
|
||||
// Publishing to PDS
|
||||
PublishStatus string `json:"publish_status"` // "held", "pass", "fail"
|
||||
PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
|
||||
}
|
||||
|
||||
// saveFeed stores a feed in SQLite
|
||||
func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
// Default publishStatus to "held" if not set
|
||||
publishStatus := feed.PublishStatus
|
||||
if publishStatus == "" {
|
||||
publishStatus = "held"
|
||||
}
|
||||
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO feeds (
|
||||
url, type, title, description, language, siteUrl,
|
||||
url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
type = excluded.type,
|
||||
category = excluded.category,
|
||||
title = excluded.title,
|
||||
description = excluded.description,
|
||||
language = excluded.language,
|
||||
@@ -161,9 +216,11 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
avgPostFreqHrs = excluded.avgPostFreqHrs,
|
||||
oldestItemDate = excluded.oldestItemDate,
|
||||
newestItemDate = excluded.newestItemDate,
|
||||
noUpdate = excluded.noUpdate
|
||||
noUpdate = excluded.noUpdate,
|
||||
publishStatus = excluded.publishStatus,
|
||||
publishAccount = excluded.publishAccount
|
||||
`,
|
||||
feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
|
||||
feed.URL, feed.Type, feed.Category, nullString(feed.Title), nullString(feed.Description),
|
||||
nullString(feed.Language), nullString(feed.SiteURL),
|
||||
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
|
||||
nullString(feed.ETag), nullString(feed.LastModified),
|
||||
@@ -172,6 +229,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
|
||||
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
|
||||
feed.NoUpdate,
|
||||
publishStatus, nullString(feed.PublishAccount),
|
||||
)
|
||||
return err
|
||||
}
|
||||
@@ -179,23 +237,25 @@ func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
// getFeed retrieves a feed from SQLite
|
||||
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
feed := &Feed{}
|
||||
var title, description, language, siteURL sql.NullString
|
||||
var category, title, description, language, siteURL sql.NullString
|
||||
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
|
||||
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
|
||||
var avgPostFreqHrs sql.NullFloat64
|
||||
var publishStatus, publishAccount sql.NullString
|
||||
|
||||
err := c.db.QueryRow(`
|
||||
SELECT url, type, title, description, language, siteUrl,
|
||||
SELECT url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
FROM feeds WHERE url = ?
|
||||
`, normalizeURL(feedURL)).Scan(
|
||||
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
|
||||
&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
|
||||
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
||||
&etag, &lastModified,
|
||||
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
|
||||
@@ -203,6 +263,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
&sourceURL, &sourceHost, &tld,
|
||||
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
|
||||
&feed.NoUpdate,
|
||||
&publishStatus, &publishAccount,
|
||||
)
|
||||
|
||||
if err == sql.ErrNoRows {
|
||||
@@ -213,6 +274,11 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
}
|
||||
|
||||
// Handle nullable fields
|
||||
if category.Valid {
|
||||
feed.Category = category.String
|
||||
} else {
|
||||
feed.Category = "main" // Default
|
||||
}
|
||||
if title.Valid {
|
||||
feed.Title = title.String
|
||||
}
|
||||
@@ -267,6 +333,14 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
if newestItemDate.Valid {
|
||||
feed.NewestItemDate = newestItemDate.Time
|
||||
}
|
||||
if publishStatus.Valid {
|
||||
feed.PublishStatus = publishStatus.String
|
||||
} else {
|
||||
feed.PublishStatus = "held"
|
||||
}
|
||||
if publishAccount.Valid {
|
||||
feed.PublishAccount = publishAccount.String
|
||||
}
|
||||
|
||||
return feed, nil
|
||||
}
|
||||
@@ -281,14 +355,15 @@ func (c *Crawler) feedExists(feedURL string) bool {
|
||||
// GetAllFeeds returns all feeds from the database
|
||||
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, type, title, description, language, siteUrl,
|
||||
SELECT url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
FROM feeds
|
||||
`)
|
||||
if err != nil {
|
||||
@@ -316,14 +391,15 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
|
||||
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
|
||||
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, type, title, description, language, siteUrl,
|
||||
SELECT url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
FROM feeds
|
||||
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
|
||||
ORDER BY RANDOM()
|
||||
@@ -340,14 +416,15 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
|
||||
// GetFeedsByHost returns all feeds from a specific host
|
||||
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, type, title, description, language, siteUrl,
|
||||
SELECT url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
FROM feeds WHERE sourceHost = ?
|
||||
`, host)
|
||||
if err != nil {
|
||||
@@ -361,14 +438,15 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
|
||||
// SearchFeeds performs a full-text search on feeds
|
||||
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
|
||||
SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl,
|
||||
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
|
||||
f.etag, f.lastModified,
|
||||
f.ttlMinutes, f.updatePeriod, f.updateFreq,
|
||||
f.status, f.errorCount, f.lastError, f.lastErrorAt,
|
||||
f.sourceUrl, f.sourceHost, f.tld,
|
||||
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
|
||||
f.noUpdate
|
||||
f.noUpdate,
|
||||
f.publishStatus, f.publishAccount
|
||||
FROM feeds f
|
||||
JOIN feeds_fts fts ON f.rowid = fts.rowid
|
||||
WHERE feeds_fts MATCH ?
|
||||
@@ -387,13 +465,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
|
||||
|
||||
for rows.Next() {
|
||||
feed := &Feed{}
|
||||
var title, description, language, siteURL sql.NullString
|
||||
var category, title, description, language, siteURL sql.NullString
|
||||
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
|
||||
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
|
||||
var avgPostFreqHrs sql.NullFloat64
|
||||
var publishStatus, publishAccount sql.NullString
|
||||
|
||||
if err := rows.Scan(
|
||||
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
|
||||
&feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
|
||||
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
|
||||
&etag, &lastModified,
|
||||
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
|
||||
@@ -401,11 +480,17 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
|
||||
&sourceURL, &sourceHost, &tld,
|
||||
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
|
||||
&feed.NoUpdate,
|
||||
&publishStatus, &publishAccount,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Handle nullable fields
|
||||
if category.Valid {
|
||||
feed.Category = category.String
|
||||
} else {
|
||||
feed.Category = "main"
|
||||
}
|
||||
if title.Valid {
|
||||
feed.Title = title.String
|
||||
}
|
||||
@@ -460,6 +545,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
|
||||
if newestItemDate.Valid {
|
||||
feed.NewestItemDate = newestItemDate.Time
|
||||
}
|
||||
if publishStatus.Valid {
|
||||
feed.PublishStatus = publishStatus.String
|
||||
} else {
|
||||
feed.PublishStatus = "held"
|
||||
}
|
||||
if publishAccount.Valid {
|
||||
feed.PublishAccount = publishAccount.String
|
||||
}
|
||||
|
||||
feeds = append(feeds, feed)
|
||||
}
|
||||
@@ -469,9 +562,27 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
|
||||
|
||||
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
|
||||
func (c *Crawler) saveItem(item *Item) error {
|
||||
// Serialize enclosure fields
|
||||
var enclosureUrl, enclosureType sql.NullString
|
||||
var enclosureLength sql.NullInt64
|
||||
if item.Enclosure != nil {
|
||||
enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
|
||||
enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
|
||||
enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
|
||||
}
|
||||
|
||||
// Serialize imageUrls as JSON
|
||||
var imageUrlsJSON sql.NullString
|
||||
if len(item.ImageURLs) > 0 {
|
||||
if data, err := json.Marshal(item.ImageURLs); err == nil {
|
||||
imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
|
||||
}
|
||||
}
|
||||
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrls)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(feedUrl, guid) DO UPDATE SET
|
||||
title = excluded.title,
|
||||
link = excluded.link,
|
||||
@@ -479,11 +590,16 @@ func (c *Crawler) saveItem(item *Item) error {
|
||||
content = excluded.content,
|
||||
author = excluded.author,
|
||||
pubDate = excluded.pubDate,
|
||||
updatedAt = excluded.updatedAt
|
||||
updatedAt = excluded.updatedAt,
|
||||
enclosureUrl = excluded.enclosureUrl,
|
||||
enclosureType = excluded.enclosureType,
|
||||
enclosureLength = excluded.enclosureLength,
|
||||
imageUrls = excluded.imageUrls
|
||||
`,
|
||||
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
|
||||
nullString(item.Description), nullString(item.Content), nullString(item.Author),
|
||||
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
|
||||
)
|
||||
return err
|
||||
}
|
||||
@@ -501,8 +617,9 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
defer tx.Rollback()
|
||||
|
||||
stmt, err := tx.Prepare(`
|
||||
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrls)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(feedUrl, guid) DO UPDATE SET
|
||||
title = excluded.title,
|
||||
link = excluded.link,
|
||||
@@ -510,7 +627,11 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
content = excluded.content,
|
||||
author = excluded.author,
|
||||
pubDate = excluded.pubDate,
|
||||
updatedAt = excluded.updatedAt
|
||||
updatedAt = excluded.updatedAt,
|
||||
enclosureUrl = excluded.enclosureUrl,
|
||||
enclosureType = excluded.enclosureType,
|
||||
enclosureLength = excluded.enclosureLength,
|
||||
imageUrls = excluded.imageUrls
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -521,10 +642,29 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
if item == nil || item.GUID == "" {
|
||||
continue // Skip nil items or items without GUID
|
||||
}
|
||||
|
||||
// Serialize enclosure fields
|
||||
var enclosureUrl, enclosureType sql.NullString
|
||||
var enclosureLength sql.NullInt64
|
||||
if item.Enclosure != nil {
|
||||
enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
|
||||
enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
|
||||
enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
|
||||
}
|
||||
|
||||
// Serialize imageUrls as JSON
|
||||
var imageUrlsJSON sql.NullString
|
||||
if len(item.ImageURLs) > 0 {
|
||||
if data, err := json.Marshal(item.ImageURLs); err == nil {
|
||||
imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
|
||||
}
|
||||
}
|
||||
|
||||
_, err := stmt.Exec(
|
||||
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
|
||||
nullString(item.Description), nullString(item.Content), nullString(item.Author),
|
||||
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
|
||||
)
|
||||
if err != nil {
|
||||
continue // Skip failed items
|
||||
@@ -537,7 +677,9 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
// GetItemsByFeed returns all items for a specific feed
|
||||
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
|
||||
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrls,
|
||||
publishedAt, publishedUri
|
||||
FROM items
|
||||
WHERE feedUrl = ?
|
||||
ORDER BY pubDate DESC
|
||||
@@ -548,55 +690,15 @@ func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var items []*Item
|
||||
for rows.Next() {
|
||||
item := &Item{}
|
||||
var guid, title, link, description, content, author sql.NullString
|
||||
var pubDate, updatedAt sql.NullTime
|
||||
|
||||
if err := rows.Scan(
|
||||
&item.ID, &item.FeedURL, &guid, &title, &link,
|
||||
&description, &content, &author, &pubDate,
|
||||
&item.DiscoveredAt, &updatedAt,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if guid.Valid {
|
||||
item.GUID = guid.String
|
||||
}
|
||||
if title.Valid {
|
||||
item.Title = title.String
|
||||
}
|
||||
if link.Valid {
|
||||
item.Link = link.String
|
||||
}
|
||||
if description.Valid {
|
||||
item.Description = description.String
|
||||
}
|
||||
if content.Valid {
|
||||
item.Content = content.String
|
||||
}
|
||||
if author.Valid {
|
||||
item.Author = author.String
|
||||
}
|
||||
if pubDate.Valid {
|
||||
item.PubDate = pubDate.Time
|
||||
}
|
||||
if updatedAt.Valid {
|
||||
item.UpdatedAt = updatedAt.Time
|
||||
}
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
return items, rows.Err()
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// SearchItems performs a full-text search on items
|
||||
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
|
||||
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt,
|
||||
i.enclosureUrl, i.enclosureType, i.enclosureLength, i.imageUrls,
|
||||
i.publishedAt, i.publishedUri
|
||||
FROM items i
|
||||
JOIN items_fts fts ON i.id = fts.rowid
|
||||
WHERE items_fts MATCH ?
|
||||
@@ -608,16 +710,27 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// scanItems is a helper to scan multiple item rows
|
||||
func scanItems(rows *sql.Rows) ([]*Item, error) {
|
||||
var items []*Item
|
||||
for rows.Next() {
|
||||
item := &Item{}
|
||||
var guid, title, link, description, content, author sql.NullString
|
||||
var pubDate, updatedAt sql.NullTime
|
||||
var pubDate, updatedAt, publishedAt sql.NullTime
|
||||
var enclosureUrl, enclosureType sql.NullString
|
||||
var enclosureLength sql.NullInt64
|
||||
var imageUrlsJSON sql.NullString
|
||||
var publishedUri sql.NullString
|
||||
|
||||
if err := rows.Scan(
|
||||
&item.ID, &item.FeedURL, &guid, &title, &link,
|
||||
&description, &content, &author, &pubDate,
|
||||
&item.DiscoveredAt, &updatedAt,
|
||||
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
|
||||
&publishedAt, &publishedUri,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
@@ -647,6 +760,32 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
|
||||
item.UpdatedAt = updatedAt.Time
|
||||
}
|
||||
|
||||
// Parse enclosure
|
||||
if enclosureUrl.Valid && enclosureUrl.String != "" {
|
||||
item.Enclosure = &Enclosure{
|
||||
URL: enclosureUrl.String,
|
||||
Type: enclosureType.String,
|
||||
}
|
||||
if enclosureLength.Valid {
|
||||
item.Enclosure.Length = enclosureLength.Int64
|
||||
}
|
||||
}
|
||||
|
||||
// Parse imageUrls JSON
|
||||
if imageUrlsJSON.Valid && imageUrlsJSON.String != "" {
|
||||
var urls []string
|
||||
if err := json.Unmarshal([]byte(imageUrlsJSON.String), &urls); err == nil {
|
||||
item.ImageURLs = urls
|
||||
}
|
||||
}
|
||||
|
||||
if publishedAt.Valid {
|
||||
item.PublishedAt = publishedAt.Time
|
||||
}
|
||||
if publishedUri.Valid {
|
||||
item.PublishedUri = publishedUri.String
|
||||
}
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
@@ -667,10 +806,6 @@ func (c *Crawler) CleanupOldItems() (int64, error) {
|
||||
|
||||
// processFeed parses and stores a feed with full metadata
|
||||
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
|
||||
if strings.Contains(feedURL, "/comment") {
|
||||
return
|
||||
}
|
||||
|
||||
// Fast path: check without lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
@@ -690,6 +825,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
||||
feed := &Feed{
|
||||
URL: normalizeURL(feedURL),
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
LastCrawledAt: now,
|
||||
Status: "active",
|
||||
@@ -708,6 +844,9 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
||||
items = c.parseAtomMetadata(body, feed)
|
||||
}
|
||||
|
||||
// Refine category based on parsed title (e.g., "Comments on:")
|
||||
feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
|
||||
|
||||
// Calculate next crawl time
|
||||
feed.NextCrawlAt = c.calculateNextCrawl(feed)
|
||||
|
||||
@@ -723,11 +862,6 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
||||
|
||||
// addFeed adds a discovered feed URL (not yet fetched)
|
||||
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
||||
// Skip comment, category, and article feeds
|
||||
if skip, _ := shouldSkipFeed(feedURL); skip {
|
||||
return
|
||||
}
|
||||
|
||||
// Fast path: check without lock
|
||||
if c.feedExists(feedURL) {
|
||||
return
|
||||
@@ -746,6 +880,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
||||
feed := &Feed{
|
||||
URL: normalizedURL,
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
Status: "active",
|
||||
SourceURL: normalizeURL(sourceURL),
|
||||
@@ -896,3 +1031,103 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'fail')
|
||||
// If status is 'pass', the account handle is also set (auto-derived if empty)
|
||||
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
|
||||
feedURL = normalizeURL(feedURL)
|
||||
|
||||
// Auto-derive account if passing and not provided
|
||||
if status == "pass" && account == "" {
|
||||
account = DeriveHandleFromFeed(feedURL)
|
||||
}
|
||||
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE feeds SET publishStatus = ?, publishAccount = ? WHERE url = ?
|
||||
`, status, nullString(account), feedURL)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetFeedsByPublishStatus returns all feeds with a specific publish status
|
||||
func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
FROM feeds
|
||||
WHERE publishStatus = ?
|
||||
`, status)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanFeeds(rows)
|
||||
}
|
||||
|
||||
// GetPublishCandidates returns feeds that are held for review and have items
|
||||
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, type, category, title, description, language, siteUrl,
|
||||
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
|
||||
etag, lastModified,
|
||||
ttlMinutes, updatePeriod, updateFreq,
|
||||
status, errorCount, lastError, lastErrorAt,
|
||||
sourceUrl, sourceHost, tld,
|
||||
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
|
||||
noUpdate,
|
||||
publishStatus, publishAccount
|
||||
FROM feeds
|
||||
WHERE publishStatus = 'held' AND itemCount > 0 AND status = 'active'
|
||||
ORDER BY itemCount DESC
|
||||
LIMIT ?
|
||||
`, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanFeeds(rows)
|
||||
}
|
||||
|
||||
// GetUnpublishedItems returns items for a feed that haven't been published yet
|
||||
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrls,
|
||||
publishedAt, publishedUri
|
||||
FROM items
|
||||
WHERE feedUrl = ? AND publishedAt IS NULL
|
||||
ORDER BY pubDate ASC
|
||||
LIMIT ?
|
||||
`, feedURL, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanItems(rows)
|
||||
}
|
||||
|
||||
// MarkItemPublished marks an item as published with the given URI
|
||||
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE items SET publishedAt = datetime('now'), publishedUri = ? WHERE id = ?
|
||||
`, uri, itemID)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetUnpublishedItemCount returns the count of unpublished items for a feed
|
||||
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
|
||||
var count int
|
||||
err := c.db.QueryRow(`
|
||||
SELECT COUNT(*) FROM items WHERE feedUrl = ? AND publishedAt IS NULL
|
||||
`, feedURL).Scan(&count)
|
||||
return count, err
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ package main
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -17,7 +19,10 @@ func main() {
|
||||
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer crawler.Close()
|
||||
|
||||
// Setup graceful shutdown
|
||||
sigChan := make(chan os.Signal, 1)
|
||||
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
// Start dashboard in background
|
||||
go func() {
|
||||
@@ -41,9 +46,24 @@ func main() {
|
||||
// Stats loop (background) - updates once per minute
|
||||
go crawler.StartStatsLoop()
|
||||
|
||||
// Cleanup loop (background) - removes old items once per hour
|
||||
// Cleanup loop (background) - removes old items once per week
|
||||
go crawler.StartCleanupLoop()
|
||||
|
||||
// Crawl loop (foreground - blocks forever)
|
||||
crawler.StartCrawlLoop()
|
||||
// Maintenance loop (background) - WAL checkpoints and integrity checks
|
||||
go crawler.StartMaintenanceLoop()
|
||||
|
||||
// Crawl loop (background)
|
||||
go crawler.StartCrawlLoop()
|
||||
|
||||
// Wait for shutdown signal
|
||||
sig := <-sigChan
|
||||
fmt.Printf("\nReceived %v, shutting down gracefully...\n", sig)
|
||||
|
||||
// Close crawler (checkpoints WAL and closes database)
|
||||
if err := crawler.Close(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error closing crawler: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Println("Shutdown complete")
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -23,17 +24,52 @@ type RSSChannel struct {
|
||||
UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
|
||||
UpdateFreq int `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
|
||||
Items []RSSItem `xml:"item"`
|
||||
// iTunes podcast namespace
|
||||
ITunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
|
||||
ITunesOwner string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
|
||||
ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
|
||||
ITunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
|
||||
}
|
||||
|
||||
type RSSItem struct {
|
||||
Title string `xml:"title"`
|
||||
Link string `xml:"link"`
|
||||
GUID string `xml:"guid"`
|
||||
Description string `xml:"description"`
|
||||
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||
Author string `xml:"author"`
|
||||
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
Title string `xml:"title"`
|
||||
Link string `xml:"link"`
|
||||
GUID string `xml:"guid"`
|
||||
Description string `xml:"description"`
|
||||
Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
||||
Author string `xml:"author"`
|
||||
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
Enclosure *RSSEnclosure `xml:"enclosure"`
|
||||
// iTunes item elements
|
||||
ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
|
||||
ITunesEpisode int `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
|
||||
ITunesImage string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
|
||||
// Media RSS elements
|
||||
MediaContent []MediaContent `xml:"http://search.yahoo.com/mrss/ content"`
|
||||
MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
|
||||
}
|
||||
|
||||
// MediaContent represents a media:content element
|
||||
type MediaContent struct {
|
||||
URL string `xml:"url,attr"`
|
||||
Type string `xml:"type,attr"`
|
||||
Medium string `xml:"medium,attr"` // image, video, audio
|
||||
Width int `xml:"width,attr"`
|
||||
Height int `xml:"height,attr"`
|
||||
}
|
||||
|
||||
// MediaThumbnail represents a media:thumbnail element
|
||||
type MediaThumbnail struct {
|
||||
URL string `xml:"url,attr"`
|
||||
Width int `xml:"width,attr"`
|
||||
Height int `xml:"height,attr"`
|
||||
}
|
||||
|
||||
type RSSEnclosure struct {
|
||||
URL string `xml:"url,attr"`
|
||||
Type string `xml:"type,attr"`
|
||||
Length int64 `xml:"length,attr"`
|
||||
}
|
||||
|
||||
// Atom structs for parsing
|
||||
@@ -70,6 +106,43 @@ type AtomLink struct {
|
||||
Type string `xml:"type,attr"`
|
||||
}
|
||||
|
||||
// isPodcast checks if an RSS feed is a podcast based on content
|
||||
func isPodcast(ch RSSChannel) bool {
|
||||
// Check for iTunes namespace elements at channel level
|
||||
if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
|
||||
ch.ITunesExplicit != "" || ch.ITunesType != "" {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check items for audio enclosures or iTunes elements
|
||||
audioCount := 0
|
||||
for _, item := range ch.Items {
|
||||
// Check for iTunes duration or episode number
|
||||
if item.ITunesDuration != "" || item.ITunesEpisode > 0 {
|
||||
return true
|
||||
}
|
||||
// Check for audio/video enclosure
|
||||
if item.Enclosure != nil && item.Enclosure.URL != "" {
|
||||
mimeType := strings.ToLower(item.Enclosure.Type)
|
||||
if strings.HasPrefix(mimeType, "audio/") ||
|
||||
strings.HasPrefix(mimeType, "video/") ||
|
||||
strings.Contains(mimeType, "mpeg") ||
|
||||
strings.Contains(mimeType, "mp3") ||
|
||||
strings.Contains(mimeType, "mp4") ||
|
||||
strings.Contains(mimeType, "m4a") ||
|
||||
strings.Contains(mimeType, "ogg") {
|
||||
audioCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
// If more than half the items have audio enclosures, it's a podcast
|
||||
if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
var rss RSS
|
||||
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
|
||||
@@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
}
|
||||
|
||||
ch := rss.Channel
|
||||
|
||||
feed.Title = ch.Title
|
||||
feed.Description = ch.Description
|
||||
feed.Language = ch.Language
|
||||
@@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
feed.UpdateFreq = ch.UpdateFreq
|
||||
feed.ItemCount = len(ch.Items)
|
||||
|
||||
// Detect podcast
|
||||
if isPodcast(ch) {
|
||||
feed.Category = "podcast"
|
||||
}
|
||||
|
||||
// Parse lastBuildDate
|
||||
if ch.LastBuildDate != "" {
|
||||
if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
|
||||
@@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
}
|
||||
}
|
||||
|
||||
// Map enclosure
|
||||
if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
|
||||
item.Enclosure = &Enclosure{
|
||||
URL: rssItem.Enclosure.URL,
|
||||
Type: rssItem.Enclosure.Type,
|
||||
Length: rssItem.Enclosure.Length,
|
||||
}
|
||||
}
|
||||
|
||||
// Extract images from various sources
|
||||
item.ImageURLs = extractItemImages(rssItem)
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
@@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
|
||||
// Default: crawl every 6 hours
|
||||
return now.Add(6 * time.Hour)
|
||||
}
|
||||
|
||||
// extractItemImages extracts image URLs from an RSS item
|
||||
// Sources: media:content, media:thumbnail, iTunes image, and <img> tags in HTML
|
||||
func extractItemImages(rssItem RSSItem) []string {
|
||||
seen := make(map[string]bool)
|
||||
var images []string
|
||||
|
||||
addImage := func(url string) {
|
||||
url = strings.TrimSpace(url)
|
||||
if url == "" || seen[url] {
|
||||
return
|
||||
}
|
||||
// Basic validation
|
||||
if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
|
||||
return
|
||||
}
|
||||
seen[url] = true
|
||||
images = append(images, url)
|
||||
}
|
||||
|
||||
// 1. Media RSS content (prefer larger images)
|
||||
for _, mc := range rssItem.MediaContent {
|
||||
if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) {
|
||||
addImage(mc.URL)
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Media RSS thumbnails
|
||||
for _, mt := range rssItem.MediaThumbnail {
|
||||
if mt.URL != "" {
|
||||
addImage(mt.URL)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. iTunes image
|
||||
if rssItem.ITunesImage != "" {
|
||||
addImage(rssItem.ITunesImage)
|
||||
}
|
||||
|
||||
// 4. Image enclosure
|
||||
if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") {
|
||||
addImage(rssItem.Enclosure.URL)
|
||||
}
|
||||
|
||||
// 5. Extract <img> tags from description and content
|
||||
htmlImages := extractImgTags(rssItem.Description)
|
||||
htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...)
|
||||
for _, img := range htmlImages {
|
||||
addImage(img)
|
||||
}
|
||||
|
||||
return images
|
||||
}
|
||||
|
||||
// extractImgTags extracts src URLs from <img> tags in HTML
|
||||
func extractImgTags(html string) []string {
|
||||
if html == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
var urls []string
|
||||
|
||||
// Simple regex to find img src attributes
|
||||
// Matches: src="..." or src='...'
|
||||
imgRegex := regexp.MustCompile(`<img[^>]+src\s*=\s*["']([^"']+)["']`)
|
||||
matches := imgRegex.FindAllStringSubmatch(html, -1)
|
||||
|
||||
for _, match := range matches {
|
||||
if len(match) > 1 {
|
||||
url := strings.TrimSpace(match[1])
|
||||
// Skip data URIs, tracking pixels, and tiny images
|
||||
if strings.HasPrefix(url, "data:") {
|
||||
continue
|
||||
}
|
||||
// Skip common tracking/spacer images
|
||||
if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
|
||||
strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
|
||||
continue
|
||||
}
|
||||
urls = append(urls, url)
|
||||
}
|
||||
}
|
||||
|
||||
return urls
|
||||
}
|
||||
|
||||
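A quick illustration of what the image extractor keeps and drops, using made-up HTML:

```go
// exampleExtractImgTags: plain http(s) <img> sources are kept; data: URIs and
// obvious tracking-pixel URLs are filtered out by extractImgTags.
func exampleExtractImgTags() {
	html := `<p><img src="https://example.com/photo.jpg" alt="">` +
		`<img src='data:image/gif;base64,R0lGOD'>` +
		`<img src="https://cdn.example.com/pixel.gif"></p>`
	fmt.Println(extractImgTags(html)) // [https://example.com/photo.jpg]
}
```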
+909
@@ -0,0 +1,909 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/base32"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Publisher handles posting items to AT Protocol PDS
|
||||
type Publisher struct {
|
||||
pdsHost string
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
// PDSSession holds authentication info for a PDS account
|
||||
type PDSSession struct {
|
||||
DID string `json:"did"`
|
||||
Handle string `json:"handle"`
|
||||
AccessJwt string `json:"accessJwt"`
|
||||
RefreshJwt string `json:"refreshJwt"`
|
||||
}
|
||||
|
||||
// BskyPost represents an app.bsky.feed.post record
|
||||
type BskyPost struct {
|
||||
Type string `json:"$type"`
|
||||
Text string `json:"text"`
|
||||
CreatedAt string `json:"createdAt"`
|
||||
Facets []BskyFacet `json:"facets,omitempty"`
|
||||
Embed *BskyEmbed `json:"embed,omitempty"`
|
||||
}
|
||||
|
||||
type BskyFacet struct {
|
||||
Index BskyByteSlice `json:"index"`
|
||||
Features []BskyFeature `json:"features"`
|
||||
}
|
||||
|
||||
type BskyByteSlice struct {
|
||||
ByteStart int `json:"byteStart"`
|
||||
ByteEnd int `json:"byteEnd"`
|
||||
}
|
||||
|
||||
type BskyFeature struct {
|
||||
Type string `json:"$type"`
|
||||
URI string `json:"uri,omitempty"`
|
||||
}
|
||||
|
||||
type BskyEmbed struct {
|
||||
Type string `json:"$type"`
|
||||
External *BskyExternal `json:"external,omitempty"`
|
||||
Images []BskyImage `json:"images,omitempty"`
|
||||
}
|
||||
|
||||
type BskyExternal struct {
|
||||
URI string `json:"uri"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Thumb *BlobRef `json:"thumb,omitempty"`
|
||||
}
|
||||
|
||||
type BskyImage struct {
|
||||
Alt string `json:"alt"`
|
||||
Image *BlobRef `json:"image"`
|
||||
}
|
||||
|
||||
// NewPublisher creates a new Publisher instance
|
||||
func NewPublisher(pdsHost string) *Publisher {
|
||||
return &Publisher{
|
||||
pdsHost: pdsHost,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// CreateSession authenticates with the PDS and returns a session
|
||||
func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) {
|
||||
payload := map[string]string{
|
||||
"identifier": handle,
|
||||
"password": password,
|
||||
}
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp, err := p.httpClient.Post(
|
||||
p.pdsHost+"/xrpc/com.atproto.server.createSession",
|
||||
"application/json",
|
||||
bytes.NewReader(body),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(respBody))
|
||||
}
|
||||
|
||||
var session PDSSession
|
||||
if err := json.NewDecoder(resp.Body).Decode(&session); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &session, nil
|
||||
}

// CreateAccount creates a new account on the PDS
// Requires an invite code if the PDS has invites enabled
func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) {
	payload := map[string]interface{}{
		"handle":   handle,
		"email":    email,
		"password": password,
	}
	if inviteCode != "" {
		payload["inviteCode"] = inviteCode
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return nil, err
	}

	resp, err := p.httpClient.Post(
		p.pdsHost+"/xrpc/com.atproto.server.createAccount",
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(respBody))
	}

	var session PDSSession
	if err := json.Unmarshal(respBody, &session); err != nil {
		return nil, err
	}

	return &session, nil
}

// CreateInviteCode creates an invite code using PDS admin password (Basic Auth)
func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) {
	payload := map[string]interface{}{
		"useCount": useCount,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	// PDS admin APIs use Basic Auth with "admin" as username
	req.SetBasicAuth("admin", adminPassword)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(respBody))
	}

	var result struct {
		Code string `json:"code"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return "", err
	}

	return result.Code, nil
}
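
// Illustrative sketch (hypothetical handle, email, and password values):
// provisioning a feed account on a PDS with invites enabled — mint a
// single-use invite with the admin password, then redeem it on creation.
func exampleProvisionAccount(pub *Publisher, adminPassword string) (*PDSSession, error) {
	code, err := pub.CreateInviteCode(adminPassword, 1)
	if err != nil {
		return nil, err
	}
	return pub.CreateAccount("show-ycombinator.1440.news", "feeds@example.com", "generated-password", code)
}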

// GenerateRkey creates a deterministic rkey from a GUID and timestamp
// Uses a truncated base32-encoded SHA256 hash
// Including the timestamp allows regenerating a new rkey by updating discoveredAt
func GenerateRkey(guid string, timestamp time.Time) string {
	if guid == "" {
		return ""
	}

	// Combine GUID with timestamp for the hash input
	// Format timestamp to second precision for consistency
	input := guid + "|" + timestamp.UTC().Format(time.RFC3339)
	hash := sha256.Sum256([]byte(input))
	// Use first 10 bytes (80 bits) - plenty for uniqueness
	// Base32 encode without padding, lowercase for rkey compatibility
	encoded := base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(hash[:10])
	return strings.ToLower(encoded)
}
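
// Illustrative sketch (hypothetical values): the rkey is stable for a given
// (guid, discoveredAt) pair, so the same item always maps to the same record
// key, while bumping discoveredAt yields a fresh one.
func exampleRkeyDeterminism() bool {
	guid := "https://example.com/article-123"
	t := time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC)
	return GenerateRkey(guid, t) == GenerateRkey(guid, t) // always true
}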

// extractURLs finds all URLs in a string
func extractURLs(text string) []string {
	// Match http:// or https:// URLs
	urlRegex := regexp.MustCompile(`https?://[^\s<>"'\)]+`)
	matches := urlRegex.FindAllString(text, -1)

	// Clean up trailing punctuation
	var urls []string
	for _, u := range matches {
		// Remove trailing punctuation that's likely not part of the URL
		u = strings.TrimRight(u, ".,;:!?")
		if u != "" {
			urls = append(urls, u)
		}
	}
	return urls
}
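
// Illustrative sketch (hypothetical input): URLs are pulled out of free-form
// description text and trailing sentence punctuation is trimmed.
func exampleExtractURLs() []string {
	return extractURLs("Listen at https://example.com/ep1. More at https://example.com/notes")
	// → []string{"https://example.com/ep1", "https://example.com/notes"}
}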

// PublishItem posts a feed item to the PDS
// Returns the AT URI of the created record, or error
func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
	if item.GUID == "" && item.Link == "" {
		return "", fmt.Errorf("item has no GUID or link, cannot publish")
	}

	// Collect all unique URLs: main link + any URLs in description
	urlSet := make(map[string]bool)
	var allURLs []string

	// Add main link first
	if item.Link != "" {
		urlSet[item.Link] = true
		allURLs = append(allURLs, item.Link)
	}

	// Add enclosure URL for podcasts/media (audio/video)
	if item.Enclosure != nil && item.Enclosure.URL != "" {
		encType := strings.ToLower(item.Enclosure.Type)
		if strings.HasPrefix(encType, "audio/") || strings.HasPrefix(encType, "video/") {
			if !urlSet[item.Enclosure.URL] {
				urlSet[item.Enclosure.URL] = true
				allURLs = append(allURLs, item.Enclosure.URL)
			}
		}
	}

	// Extract URLs from description
	descURLs := extractURLs(item.Description)
	for _, u := range descURLs {
		if !urlSet[u] {
			urlSet[u] = true
			allURLs = append(allURLs, u)
		}
	}

	// Extract URLs from content if available
	contentURLs := extractURLs(item.Content)
	for _, u := range contentURLs {
		if !urlSet[u] {
			urlSet[u] = true
			allURLs = append(allURLs, u)
		}
	}

	// Build post text: title + all links
	// Bluesky has a 300 grapheme limit
	var textBuilder strings.Builder
	textBuilder.WriteString(item.Title)

	for _, u := range allURLs {
		textBuilder.WriteString("\n\n")
		textBuilder.WriteString(u)
	}

	text := textBuilder.String()

	// Truncate title if text is too long (keep URLs intact)
	// Note: the limit is 300 graphemes; byte length is used here as a
	// conservative stand-in, which may over-truncate multi-byte titles and
	// can cut a multi-byte character at the boundary
	const maxLen = 300
	if len(text) > maxLen {
		// Calculate space needed for URLs
		urlSpace := 0
		for _, u := range allURLs {
			urlSpace += len(u) + 2 // +2 for \n\n
		}

		maxTitleLen := maxLen - urlSpace - 3 // -3 for "..."
		if maxTitleLen > 10 {
			text = item.Title[:maxTitleLen] + "..."
			for _, u := range allURLs {
				text += "\n\n" + u
			}
		}
	}

	// Use item's pubDate for createdAt, fall back to now
	createdAt := time.Now()
	if !item.PubDate.IsZero() {
		createdAt = item.PubDate
	}

	post := BskyPost{
		Type:      "app.bsky.feed.post",
		Text:      text,
		CreatedAt: createdAt.Format(time.RFC3339),
	}

	// Add facets for all URLs
	for _, u := range allURLs {
		linkStart := strings.Index(text, u)
		if linkStart >= 0 {
			// Facets use byte offsets; strings.Index already returns one
			byteStart := linkStart
			byteEnd := byteStart + len(u)

			post.Facets = append(post.Facets, BskyFacet{
				Index: BskyByteSlice{
					ByteStart: byteStart,
					ByteEnd:   byteEnd,
				},
				Features: []BskyFeature{
					{
						Type: "app.bsky.richtext.facet#link",
						URI:  u,
					},
				},
			})
		}
	}

	// Decide embed type based on content
	// Priority: images > external link card
	if len(item.ImageURLs) > 0 {
		// Try to upload images (up to 4)
		uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title)
		if len(uploadedImages) > 0 {
			post.Embed = &BskyEmbed{
				Type:   "app.bsky.embed.images",
				Images: uploadedImages,
			}
		}
	}

	// Fall back to external embed if no images were uploaded
	if post.Embed == nil && len(allURLs) > 0 {
		external := &BskyExternal{
			URI:         allURLs[0],
			Title:       item.Title,
			Description: truncate(stripHTML(item.Description), 300),
		}

		// Try to add thumbnail from first image
		if len(item.ImageURLs) > 0 {
			if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
				external.Thumb = thumb
			}
		}

		post.Embed = &BskyEmbed{
			Type:     "app.bsky.embed.external",
			External: external,
		}
	}

	// Use GUID + discoveredAt for deterministic rkey
	// This allows regenerating a new rkey by updating discoveredAt if needed
	guidForRkey := item.GUID
	if guidForRkey == "" {
		guidForRkey = item.Link
	}
	rkey := GenerateRkey(guidForRkey, item.DiscoveredAt)

	// Create the record with deterministic rkey
	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.feed.post",
		"rkey":       rkey,
		"record":     post,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("create record failed: %s - %s", resp.Status, string(respBody))
	}

	var result struct {
		URI string `json:"uri"`
		CID string `json:"cid"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return "", err
	}

	return result.URI, nil
}
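
// Illustrative sketch (hypothetical item values; only fields actually read
// by PublishItem are set): publish one item and log the resulting AT URI.
func examplePublishItem(pub *Publisher, session *PDSSession) {
	item := &Item{
		GUID:         "https://example.com/article-123",
		Link:         "https://example.com/article-123",
		Title:        "Example headline",
		Description:  "<p>Short summary with a link to https://example.com/more</p>",
		PubDate:      time.Now().Add(-time.Hour),
		DiscoveredAt: time.Now(),
	}
	uri, err := pub.PublishItem(session, item)
	if err != nil {
		fmt.Printf("publish failed: %v\n", err)
		return
	}
	fmt.Printf("published %s as %s\n", item.GUID, uri)
}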

// uploadImages fetches and uploads up to 4 images, returning BskyImage structs
func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage {
	var images []BskyImage
	maxImages := 4
	if len(imageURLs) < maxImages {
		maxImages = len(imageURLs)
	}

	for i := 0; i < maxImages; i++ {
		blob := p.fetchAndUploadImage(session, imageURLs[i])
		if blob != nil {
			images = append(images, BskyImage{
				Alt:   altText,
				Image: blob,
			})
		}
	}

	return images
}

// fetchAndUploadImage downloads an image and uploads it to the PDS
func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef {
	// Fetch the image
	resp, err := p.httpClient.Get(imageURL)
	if err != nil {
		return nil
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil
	}

	// Check content type
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		// Try to guess from URL
		if strings.HasSuffix(strings.ToLower(imageURL), ".png") {
			contentType = "image/png"
		} else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") {
			contentType = "image/gif"
		} else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") {
			contentType = "image/webp"
		} else {
			contentType = "image/jpeg" // Default
		}
	}

	// Only accept image types
	if !strings.HasPrefix(contentType, "image/") {
		return nil
	}

	// Read image data (limit to 1MB to avoid issues)
	data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
	if err != nil || len(data) == 0 {
		return nil
	}

	// Upload to PDS
	blob, err := p.UploadBlob(session, data, contentType)
	if err != nil {
		return nil
	}

	return blob
}

// truncate shortens s to at most maxLen bytes, appending "..." when it cuts
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	return s[:maxLen-3] + "..."
}

// stripHTML removes HTML tags from a string
func stripHTML(s string) string {
	// Remove HTML tags
	tagRegex := regexp.MustCompile(`<[^>]*>`)
	s = tagRegex.ReplaceAllString(s, "")

	// Decode common HTML entities
	s = strings.ReplaceAll(s, "&amp;", "&")
	s = strings.ReplaceAll(s, "&lt;", "<")
	s = strings.ReplaceAll(s, "&gt;", ">")
	s = strings.ReplaceAll(s, "&quot;", "\"")
	s = strings.ReplaceAll(s, "&#39;", "'")
	s = strings.ReplaceAll(s, "&nbsp;", " ")

	// Collapse whitespace
	spaceRegex := regexp.MustCompile(`\s+`)
	s = spaceRegex.ReplaceAllString(s, " ")

	return strings.TrimSpace(s)
}
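
// Illustrative sketch (hypothetical input): tags are dropped and common
// entities decoded before a description is used as embed text.
func exampleStripHTML() string {
	return stripHTML("<p>Hello &amp; welcome to the <em>show</em></p>")
	// → "Hello & welcome to the show"
}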

// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {combined-path-and-hostname}.1440.news
// The PDS limits subdomains to 18 characters, so we prioritize meaningful parts
// Example: news.ycombinator.com/showrss → show-ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
	const maxSubdomainLen = 18

	// Ensure we have a scheme for parsing
	if !strings.Contains(feedURL, "://") {
		feedURL = "https://" + feedURL
	}

	u, err := url.Parse(feedURL)
	if err != nil {
		return ""
	}

	hostname := strings.ToLower(u.Hostname())
	path := strings.ToLower(u.Path)

	// Remove common feed suffixes/extensions
	suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
	for _, suffix := range suffixesToRemove {
		path = strings.TrimSuffix(path, suffix)
	}

	// Split path into segments
	segments := strings.Split(strings.Trim(path, "/"), "/")

	// Filter out common feed-related words
	skipWords := map[string]bool{
		"rss": true, "feed": true, "feeds": true, "atom": true,
		"xml": true, "default": true, "index": true, "services": true,
		"nyt": true, // NYTimes uses /services/xml/rss/nyt/
	}

	var pathParts []string
	for _, seg := range segments {
		seg = cleanHandleSegment(seg)
		if seg != "" && !skipWords[seg] {
			pathParts = append(pathParts, seg)
		}
	}

	// Split hostname into parts, drop common TLDs to save space
	hostParts := strings.Split(hostname, ".")
	commonTLDs := map[string]bool{
		"com": true, "org": true, "net": true, "io": true, "co": true,
		"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
	}

	// Remove TLD if it's common (to save characters)
	if len(hostParts) > 1 && commonTLDs[hostParts[len(hostParts)-1]] {
		hostParts = hostParts[:len(hostParts)-1]
	}

	// Build subdomain: path parts first (they differentiate feeds), then host parts
	// Priority order for fitting in 18 chars:
	// 1. Main hostname part (e.g., "ycombinator")
	// 2. Path prefix (e.g., "show")
	// 3. Hostname subdomain (e.g., "news")

	var subdomain string

	// The main hostname is the last remaining label after TLD removal
	// (e.g., "ycombinator" from "news.ycombinator")
	mainHost := hostParts[len(hostParts)-1]

	// If path parts exist, prepend them
	if len(pathParts) > 0 {
		subdomain = pathParts[0] + "-" + mainHost
	} else if len(hostParts) > 1 {
		// No path, use subdomain-hostname (e.g., "news-ycombinator")
		subdomain = hostParts[0] + "-" + mainHost
	} else {
		subdomain = mainHost
	}

	// If still too long, just use main hostname
	if len(subdomain) > maxSubdomainLen {
		subdomain = mainHost
	}

	// Final safety: truncate if still too long
	if len(subdomain) > maxSubdomainLen {
		subdomain = subdomain[:maxSubdomainLen]
	}

	subdomain = strings.Trim(subdomain, "-")

	// Collapse multiple hyphens
	for strings.Contains(subdomain, "--") {
		subdomain = strings.ReplaceAll(subdomain, "--", "-")
	}

	return subdomain + ".1440.news"
}
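
// Illustrative sketch: derived handles stay within the PDS's 18-character
// subdomain limit by combining the most meaningful path and host labels.
func exampleDeriveHandles() []string {
	return []string{
		DeriveHandleFromFeed("news.ycombinator.com/showrss"), // "show-ycombinator.1440.news"
		DeriveHandleFromFeed("https://example.com/feed"),     // "example.1440.news"
	}
}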

// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
func cleanHandleSegment(s string) string {
	// Remove file extensions
	if idx := strings.LastIndex(s, "."); idx > 0 {
		s = s[:idx]
	}

	// Convert to lowercase
	s = strings.ToLower(s)

	// Strip common feed prefixes/suffixes from the segment itself
	// e.g., "showrss" → "show", "rssworld" → "world"
	feedAffixes := []string{"rss", "feed", "atom", "xml"}
	for _, affix := range feedAffixes {
		// Strip suffix (e.g., "showrss" → "show")
		if strings.HasSuffix(s, affix) && len(s) > len(affix) {
			s = strings.TrimSuffix(s, affix)
			break
		}
		// Strip prefix (e.g., "rssworld" → "world")
		if strings.HasPrefix(s, affix) && len(s) > len(affix) {
			s = strings.TrimPrefix(s, affix)
			break
		}
	}

	// Replace underscores and other separators with hyphens
	s = strings.ReplaceAll(s, "_", "-")
	s = strings.ReplaceAll(s, " ", "-")

	// Remove any characters that aren't alphanumeric or hyphens
	reg := regexp.MustCompile(`[^a-z0-9-]`)
	s = reg.ReplaceAllString(s, "")

	// Collapse multiple hyphens
	for strings.Contains(s, "--") {
		s = strings.ReplaceAll(s, "--", "-")
	}

	// Trim leading/trailing hyphens
	s = strings.Trim(s, "-")

	return s
}
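
// Illustrative sketch (hypothetical inputs): feed-related affixes and
// separators are cleaned from individual path segments before use in a handle.
func exampleCleanSegments() []string {
	return []string{
		cleanHandleSegment("showrss"),     // "show"
		cleanHandleSegment("rssworld"),    // "world"
		cleanHandleSegment("Top_Stories"), // "top-stories"
	}
}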

// SplitHandle extracts the path prefix and hostname from a derived handle
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
func SplitHandle(handle string) (prefix string, hostname string) {
	// Remove .1440.news suffix
	handle = strings.TrimSuffix(handle, ".1440.news")

	parts := strings.Split(handle, ".")

	// Try to find where hostname starts by looking for valid hostname patterns
	if len(parts) >= 2 {
		for i := 0; i < len(parts)-1; i++ {
			remaining := strings.Join(parts[i:], ".")
			if looksLikeHostname(remaining) {
				if i > 0 {
					prefix = strings.Join(parts[:i], ".")
				}
				hostname = remaining
				return
			}
		}
	}

	// Fallback: no prefix, entire thing is hostname
	hostname = handle
	return "", hostname
}

func isLikelyTLDPart(s string) bool {
	tlds := map[string]bool{
		"com": true, "org": true, "net": true, "edu": true, "gov": true,
		"io": true, "co": true, "uk": true, "de": true, "fr": true,
		"jp": true, "au": true, "ca": true, "nl": true, "se": true,
		"news": true, "blog": true, "tech": true, "dev": true,
	}
	return tlds[s]
}

func isTwoPartTLD(first, second string) bool {
	twoPartTLDs := map[string]bool{
		"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
		"org.uk": true, "net.au": true, "com.br": true,
	}
	return twoPartTLDs[first+"."+second]
}

func looksLikeHostname(s string) bool {
	// A hostname typically has at least one dot and ends with a TLD-like part
	parts := strings.Split(s, ".")
	if len(parts) < 2 {
		return false
	}
	lastPart := parts[len(parts)-1]
	return isLikelyTLDPart(lastPart)
}

// BlobRef represents a blob reference returned by the PDS, used for post
// images, external-embed thumbnails, and profile avatars
type BlobRef struct {
	Type     string `json:"$type"`
	Ref      Link   `json:"ref"`
	MimeType string `json:"mimeType"`
	Size     int64  `json:"size"`
}

type Link struct {
	Link string `json:"$link"`
}

// UploadBlob uploads an image to the PDS and returns a blob reference
func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) {
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", mimeType)
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(respBody))
	}

	var result struct {
		Blob BlobRef `json:"blob"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return nil, err
	}

	return &result.Blob, nil
}

// UpdateProfile updates the profile for an account
func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error {
	// First, get the current profile to preserve any existing fields
	getReq, err := http.NewRequest("GET",
		p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self",
		nil)
	if err != nil {
		return err
	}
	getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	getResp, err := p.httpClient.Do(getReq)

	var existingCID string
	profile := map[string]interface{}{
		"$type": "app.bsky.actor.profile",
	}

	if err == nil && getResp.StatusCode == http.StatusOK {
		defer getResp.Body.Close()
		var existing struct {
			CID   string                 `json:"cid"`
			Value map[string]interface{} `json:"value"`
		}
		if json.NewDecoder(getResp.Body).Decode(&existing) == nil {
			existingCID = existing.CID
			profile = existing.Value
		}
	} else if getResp != nil {
		getResp.Body.Close()
	}

	// Update fields
	if displayName != "" {
		profile["displayName"] = displayName
	}
	if description != "" {
		profile["description"] = description
	}
	if avatar != nil {
		profile["avatar"] = avatar
	}

	// Put the record
	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.actor.profile",
		"rkey":       "self",
		"record":     profile,
	}
	if existingCID != "" {
		payload["swapRecord"] = existingCID
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	respBody, _ := io.ReadAll(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody))
	}

	return nil
}

// FetchFavicon downloads a favicon/icon from a URL
func FetchFavicon(siteURL string) ([]byte, string, error) {
	// Try common favicon locations
	if !strings.HasPrefix(siteURL, "http") {
		siteURL = "https://" + siteURL
	}

	u, err := url.Parse(siteURL)
	if err != nil {
		return nil, "", err
	}

	baseURL := u.Scheme + "://" + u.Host

	// Try apple-touch-icon first (usually higher quality)
	iconURLs := []string{
		baseURL + "/apple-touch-icon.png",
		baseURL + "/apple-touch-icon-precomposed.png",
		baseURL + "/favicon.png",
		baseURL + "/favicon.ico",
	}

	client := &http.Client{Timeout: 10 * time.Second}

	for _, iconURL := range iconURLs {
		resp, err := client.Get(iconURL)
		if err != nil {
			continue
		}

		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			continue
		}

		data, err := io.ReadAll(resp.Body)
		// Close inside the loop rather than defer, so failed candidates
		// don't hold connections open until the function returns
		resp.Body.Close()
		if err != nil {
			continue
		}

		// Determine mime type
		contentType := resp.Header.Get("Content-Type")
		if contentType == "" {
			if strings.HasSuffix(iconURL, ".png") {
				contentType = "image/png"
			} else if strings.HasSuffix(iconURL, ".ico") {
				contentType = "image/x-icon"
			} else {
				contentType = "image/png" // default
			}
		}

		return data, contentType, nil
	}

	return nil, "", fmt.Errorf("no favicon found for %s", siteURL)
}
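
// Illustrative sketch (hypothetical site URL and display name): wiring
// FetchFavicon, UploadBlob, and UpdateProfile together to give a feed
// account an avatar.
func exampleSetAvatar(pub *Publisher, session *PDSSession) error {
	data, mimeType, err := FetchFavicon("https://example.com")
	if err != nil {
		return err // no icon found; the profile could still be updated without one
	}
	avatar, err := pub.UploadBlob(session, data, mimeType)
	if err != nil {
		return err
	}
	return pub.UpdateProfile(session, "Example Feed", "Mirror of https://example.com/feed", avatar)
}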

@@ -53,3 +53,31 @@ td { font-size: 13px; color: #ffffff; }
#searchInput::placeholder { color: #555; }
.search-host { margin-bottom: 10px; }
.search-feed:hover { background: #1a1a1a; }

/* Command buttons */
.cmd-btn {
	background: #1a1a1a;
	border: 1px solid #333;
	border-radius: 4px;
	color: #0af;
	padding: 6px 12px;
	margin-right: 8px;
	margin-bottom: 4px;
	font-size: 13px;
	font-family: monospace;
	cursor: pointer;
	transition: background 0.2s, border-color 0.2s;
}
.cmd-btn:hover {
	background: #252525;
	border-color: #0af;
}
.cmd-btn:active {
	background: #0af;
	color: #000;
}

/* Visit link */
.visit-link:hover {
	color: #0af !important;
}

+566 -431 (file diff suppressed because it is too large)