-
-
Loading...
+
+
+
Last updated: {{.UpdatedAt.Format "2006-01-02 15:04:05"}}
diff --git a/db.go b/db.go
index 0e49fd2..03d48cb 100644
--- a/db.go
+++ b/db.go
@@ -3,6 +3,7 @@ package main
import (
"database/sql"
"fmt"
+ "time"
_ "modernc.org/sqlite"
)
@@ -25,6 +26,7 @@ CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WH
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
type TEXT,
+ category TEXT DEFAULT 'main',
title TEXT,
description TEXT,
language TEXT,
@@ -56,14 +58,20 @@ CREATE TABLE IF NOT EXISTS feeds (
oldestItemDate DATETIME,
newestItemDate DATETIME,
- noUpdate INTEGER DEFAULT 0
+ noUpdate INTEGER DEFAULT 0,
+
+ -- Publishing to PDS
+ publishStatus TEXT DEFAULT 'held' CHECK(publishStatus IN ('held', 'pass', 'fail')),
+ publishAccount TEXT
);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
+CREATE INDEX IF NOT EXISTS idx_feeds_publishStatus ON feeds(publishStatus);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
+CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
@@ -80,6 +88,17 @@ CREATE TABLE IF NOT EXISTS items (
pubDate DATETIME,
discoveredAt DATETIME NOT NULL,
updatedAt DATETIME,
+
+ -- Media attachments
+ enclosureUrl TEXT,
+ enclosureType TEXT,
+ enclosureLength INTEGER,
+ imageUrls TEXT, -- JSON array of image URLs
+
+ -- Publishing to PDS
+ publishedAt DATETIME,
+ publishedUri TEXT,
+
UNIQUE(feedUrl, guid)
);
@@ -87,6 +106,7 @@ CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
+CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feedUrl, publishedAt) WHERE publishedAt IS NULL;
-- Full-text search for feeds
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
@@ -148,15 +168,22 @@ func OpenDatabase(dbPath string) (*sql.DB, error) {
fmt.Printf("Opening database: %s\n", dbPath)
// Use pragmas in connection string for consistent application
- connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
+ // - busy_timeout: wait up to 10s for locks instead of failing immediately
+ // - journal_mode: WAL for better concurrency and crash recovery
+ // - synchronous: NORMAL is safe with WAL (fsync at checkpoint, not every commit)
+ // - wal_autocheckpoint: checkpoint every 1000 pages (~4MB) to prevent WAL bloat
+ // - foreign_keys: enforce referential integrity
+ connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=wal_autocheckpoint(1000)&_pragma=foreign_keys(ON)"
db, err := sql.Open("sqlite", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %v", err)
}
- // Allow multiple readers (WAL mode supports concurrent reads)
- // SQLite is single-writer, but reads can happen concurrently
- db.SetMaxOpenConns(4)
+ // Connection pool settings for stability
+ db.SetMaxOpenConns(4) // Limit concurrent connections
+ db.SetMaxIdleConns(2) // Keep some connections warm
+ db.SetConnMaxLifetime(5 * time.Minute) // Recycle connections periodically
+ db.SetConnMaxIdleTime(1 * time.Minute) // Close idle connections
// Verify connection and show journal mode
var journalMode string
@@ -173,6 +200,17 @@ func OpenDatabase(dbPath string) (*sql.DB, error) {
}
fmt.Println(" Schema OK")
+ // Migrations for existing databases
+ migrations := []string{
+ "ALTER TABLE items ADD COLUMN enclosureUrl TEXT",
+ "ALTER TABLE items ADD COLUMN enclosureType TEXT",
+ "ALTER TABLE items ADD COLUMN enclosureLength INTEGER",
+ "ALTER TABLE items ADD COLUMN imageUrls TEXT",
+ }
+ for _, m := range migrations {
+ db.Exec(m) // Ignore errors (column may already exist)
+ }
+
// Run stats and ANALYZE in background to avoid blocking startup with large databases
go func() {
var domainCount, feedCount int
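Note on the pragma changes above: because each pragma is baked into the connection string, every connection the pool opens gets the same settings. A throwaway sketch for sanity-checking that they take effect (not part of the patch; the crawler.db path is a placeholder):

package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "modernc.org/sqlite"
)

func main() {
	connStr := "crawler.db?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)"
	db, err := sql.Open("sqlite", connStr)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var journalMode string
	var synchronous, busyTimeout int
	db.QueryRow("PRAGMA journal_mode").Scan(&journalMode) // expect "wal"
	db.QueryRow("PRAGMA synchronous").Scan(&synchronous)  // expect 1 (NORMAL)
	db.QueryRow("PRAGMA busy_timeout").Scan(&busyTimeout) // expect 10000
	fmt.Println(journalMode, synchronous, busyTimeout)
}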
diff --git a/docker-compose.yml b/docker-compose.yml
index 7f53c81..c067351 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,6 +3,7 @@ services:
build: .
container_name: app-1440-news
restart: unless-stopped
+ stop_grace_period: 30s
env_file:
- pds.env
volumes:
diff --git a/domain.go b/domain.go
index d4197fb..86186eb 100644
--- a/domain.go
+++ b/domain.go
@@ -88,26 +88,12 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
return domain, nil
}
-// GetUncheckedDomains returns all domains with status "unchecked"
-func (c *Crawler) GetUncheckedDomains() ([]*Domain, error) {
+// GetUncheckedDomains returns up to limit unchecked domains ordered by discoveredAt (FIFO)
+func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE status = 'unchecked'
- `)
- if err != nil {
- return nil, err
- }
- defer rows.Close()
-
- return c.scanDomains(rows)
-}
-
-// GetUncheckedDomainsRandom returns up to limit unchecked domains in random order
-func (c *Crawler) GetUncheckedDomainsRandom(limit int) ([]*Domain, error) {
- rows, err := c.db.Query(`
- SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
- FROM domains WHERE status = 'unchecked'
- ORDER BY RANDOM()
+ ORDER BY discoveredAt ASC
LIMIT ?
`, limit)
if err != nil {
@@ -224,7 +210,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
buf := make([]byte, 0, 64*1024)
scanner.Buffer(buf, 1024*1024)
- const batchSize = 10000
+ const batchSize = 1000
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
totalImported := 0
diff --git a/feed.go b/feed.go
index cb83d74..697be82 100644
--- a/feed.go
+++ b/feed.go
@@ -2,6 +2,7 @@ package main
import (
"database/sql"
+ "encoding/json"
"fmt"
"io"
"net/http"
@@ -12,58 +13,91 @@ import (
"time"
)
-// shouldSkipFeed checks if a feed URL should be filtered out
-// Returns true (and a reason) if the feed should be skipped
-func shouldSkipFeed(feedURL string) (bool, string) {
+// classifyFeed determines the category of a feed based on URL patterns
+// Returns: "main", "comments", "category", "author", "article", "podcast"
+// Note: podcast detection is also done in parseRSSMetadata based on content
+func classifyFeed(feedURL string) string {
lower := strings.ToLower(feedURL)
- // Skip explicit comment feeds
+ // Comment feeds
if strings.Contains(lower, "/comment") {
- return true, "comment feed"
+ return "comments"
+ }
+
+ // Podcast URL patterns
+ podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
+ for _, pattern := range podcastPatterns {
+ if strings.Contains(lower, pattern) {
+ return "podcast"
+ }
}
u, err := url.Parse(feedURL)
if err != nil {
- return false, ""
+ return "main"
}
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
- // Skip category/tag feeds
- categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
+ // Author feeds
+ if strings.Contains(path, "/author/") {
+ return "author"
+ }
+
+ // Category/tag feeds
+ categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
for _, pattern := range categoryPatterns {
if strings.Contains(path, pattern) {
- return true, "category/tag feed"
+ return "category"
}
}
- // Check for article comment feeds (path ending in /feed with content before it)
+ // Check for article feeds (path ending in /feed with content before it)
if strings.HasSuffix(path, "/feed") {
basePath := strings.TrimSuffix(path, "/feed")
basePath = strings.Trim(basePath, "/")
if basePath == "" {
- return false, "" // Just /feed - legitimate main feed
+ return "main" // Just /feed - main feed
}
- // Skip if path contains date patterns (likely article)
+ // Article if path contains date patterns
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
- return true, "article feed (date pattern)"
+ return "article"
}
- // Skip if path has multiple segments (likely article or nested content)
+ // Article if path has multiple segments (nested content)
segments := strings.Split(basePath, "/")
if len(segments) >= 2 {
- return true, "article feed (nested path)"
+ return "article"
}
- // Skip if single segment looks like an article slug (contains hyphens, is long)
- if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
- return true, "article feed (slug pattern)"
+ // Article if single segment looks like an article slug
+ if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 {
+ return "article"
}
}
- return false, ""
+ return "main"
+}
+
+// classifyFeedByTitle refines category based on feed title (called after parsing)
+func classifyFeedByTitle(title string, currentCategory string) string {
+ if currentCategory != "main" {
+ return currentCategory // Already classified by URL
+ }
+ lower := strings.ToLower(title)
+ if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") {
+ return "comments"
+ }
+ return currentCategory
+}
+
+// Enclosure represents a media attachment (audio, video, image)
+type Enclosure struct {
+ URL string `json:"url"`
+ Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
+ Length int64 `json:"length"` // Size in bytes
}
// Item represents an individual entry/article from a feed
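As a quick reference, these are the buckets the classifyFeed helper above produces for typical URL shapes; an example-test sketch that would live in a _test.go file in the same package, illustrative only and not part of the patch:

package main

import "fmt"

func Example_classifyFeed() {
	fmt.Println(classifyFeed("https://example.com/feed"))
	fmt.Println(classifyFeed("https://example.com/comments/feed"))
	fmt.Println(classifyFeed("https://example.com/category/tech/feed"))
	fmt.Println(classifyFeed("https://example.com/author/jane/feed"))
	fmt.Println(classifyFeed("https://example.com/blog/2024/05/some-post/feed"))
	fmt.Println(classifyFeed("https://example.com/podcasts/weekly.xml"))
	// Output:
	// main
	// comments
	// category
	// author
	// article
	// podcast
}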
@@ -79,12 +113,21 @@ type Item struct {
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
+
+ // Media attachments
+ Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
+ ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
+
+ // Publishing to PDS
+ PublishedAt time.Time `json:"published_at,omitempty"`
+ PublishedUri string `json:"published_uri,omitempty"`
}
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
URL string `json:"url"`
- Type string `json:"type"` // "rss", "atom", or "unknown"
+ Type string `json:"type"` // "rss", "atom", or "unknown"
+ Category string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
@@ -124,23 +167,35 @@ type Feed struct {
// Adaptive check interval
NoUpdate int `json:"no_update"` // Consecutive checks with no change
+
+ // Publishing to PDS
+ PublishStatus string `json:"publish_status"` // "held", "pass", "fail"
+ PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}
// saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error {
+ // Default publishStatus to "held" if not set
+ publishStatus := feed.PublishStatus
+ if publishStatus == "" {
+ publishStatus = "held"
+ }
+
_, err := c.db.Exec(`
INSERT INTO feeds (
- url, type, title, description, language, siteUrl,
+ url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
- noUpdate
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ noUpdate,
+ publishStatus, publishAccount
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
type = excluded.type,
+ category = excluded.category,
title = excluded.title,
description = excluded.description,
language = excluded.language,
@@ -161,9 +216,11 @@ func (c *Crawler) saveFeed(feed *Feed) error {
avgPostFreqHrs = excluded.avgPostFreqHrs,
oldestItemDate = excluded.oldestItemDate,
newestItemDate = excluded.newestItemDate,
- noUpdate = excluded.noUpdate
+ noUpdate = excluded.noUpdate,
+ publishStatus = excluded.publishStatus,
+ publishAccount = excluded.publishAccount
`,
- feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
+ feed.URL, feed.Type, feed.Category, nullString(feed.Title), nullString(feed.Description),
nullString(feed.Language), nullString(feed.SiteURL),
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
nullString(feed.ETag), nullString(feed.LastModified),
@@ -172,6 +229,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
feed.NoUpdate,
+ publishStatus, nullString(feed.PublishAccount),
)
return err
}
@@ -179,23 +237,25 @@ func (c *Crawler) saveFeed(feed *Feed) error {
// getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
feed := &Feed{}
- var title, description, language, siteURL sql.NullString
+ var category, title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
+ var publishStatus, publishAccount sql.NullString
err := c.db.QueryRow(`
- SELECT url, type, title, description, language, siteUrl,
+ SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
- noUpdate
+ noUpdate,
+ publishStatus, publishAccount
FROM feeds WHERE url = ?
`, normalizeURL(feedURL)).Scan(
- &feed.URL, &feed.Type, &title, &description, &language, &siteURL,
+ &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
@@ -203,6 +263,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
+ &publishStatus, &publishAccount,
)
if err == sql.ErrNoRows {
@@ -213,6 +274,11 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
}
// Handle nullable fields
+ if category.Valid {
+ feed.Category = category.String
+ } else {
+ feed.Category = "main" // Default
+ }
if title.Valid {
feed.Title = title.String
}
@@ -267,6 +333,14 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
+ if publishStatus.Valid {
+ feed.PublishStatus = publishStatus.String
+ } else {
+ feed.PublishStatus = "held"
+ }
+ if publishAccount.Valid {
+ feed.PublishAccount = publishAccount.String
+ }
return feed, nil
}
@@ -281,14 +355,15 @@ func (c *Crawler) feedExists(feedURL string) bool {
// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
rows, err := c.db.Query(`
- SELECT url, type, title, description, language, siteUrl,
+ SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
- noUpdate
+ noUpdate,
+ publishStatus, publishAccount
FROM feeds
`)
if err != nil {
@@ -316,14 +391,15 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
- SELECT url, type, title, description, language, siteUrl,
+ SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
- noUpdate
+ noUpdate,
+ publishStatus, publishAccount
FROM feeds
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
ORDER BY RANDOM()
@@ -340,14 +416,15 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
rows, err := c.db.Query(`
- SELECT url, type, title, description, language, siteUrl,
+ SELECT url, type, category, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
- noUpdate
+ noUpdate,
+ publishStatus, publishAccount
FROM feeds WHERE sourceHost = ?
`, host)
if err != nil {
@@ -361,14 +438,15 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
// SearchFeeds performs a full-text search on feeds
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
rows, err := c.db.Query(`
- SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
+ SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl,
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
f.etag, f.lastModified,
f.ttlMinutes, f.updatePeriod, f.updateFreq,
f.status, f.errorCount, f.lastError, f.lastErrorAt,
f.sourceUrl, f.sourceHost, f.tld,
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
- f.noUpdate
+ f.noUpdate,
+ f.publishStatus, f.publishAccount
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
@@ -387,13 +465,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
for rows.Next() {
feed := &Feed{}
- var title, description, language, siteURL sql.NullString
+ var category, title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
+ var publishStatus, publishAccount sql.NullString
if err := rows.Scan(
- &feed.URL, &feed.Type, &title, &description, &language, &siteURL,
+ &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
@@ -401,11 +480,17 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
+ &publishStatus, &publishAccount,
); err != nil {
continue
}
// Handle nullable fields
+ if category.Valid {
+ feed.Category = category.String
+ } else {
+ feed.Category = "main"
+ }
if title.Valid {
feed.Title = title.String
}
@@ -460,6 +545,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
+ if publishStatus.Valid {
+ feed.PublishStatus = publishStatus.String
+ } else {
+ feed.PublishStatus = "held"
+ }
+ if publishAccount.Valid {
+ feed.PublishAccount = publishAccount.String
+ }
feeds = append(feeds, feed)
}
@@ -469,9 +562,27 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
+ // Serialize enclosure fields
+ var enclosureUrl, enclosureType sql.NullString
+ var enclosureLength sql.NullInt64
+ if item.Enclosure != nil {
+ enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
+ enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
+ enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
+ }
+
+ // Serialize imageUrls as JSON
+ var imageUrlsJSON sql.NullString
+ if len(item.ImageURLs) > 0 {
+ if data, err := json.Marshal(item.ImageURLs); err == nil {
+ imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
+ }
+ }
+
_, err := c.db.Exec(`
- INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+ enclosureUrl, enclosureType, enclosureLength, imageUrls)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
@@ -479,11 +590,16 @@ func (c *Crawler) saveItem(item *Item) error {
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
- updatedAt = excluded.updatedAt
+ updatedAt = excluded.updatedAt,
+ enclosureUrl = excluded.enclosureUrl,
+ enclosureType = excluded.enclosureType,
+ enclosureLength = excluded.enclosureLength,
+ imageUrls = excluded.imageUrls
`,
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
+ enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
)
return err
}
@@ -501,8 +617,9 @@ func (c *Crawler) saveItems(items []*Item) error {
defer tx.Rollback()
stmt, err := tx.Prepare(`
- INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+ enclosureUrl, enclosureType, enclosureLength, imageUrls)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
@@ -510,7 +627,11 @@ func (c *Crawler) saveItems(items []*Item) error {
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
- updatedAt = excluded.updatedAt
+ updatedAt = excluded.updatedAt,
+ enclosureUrl = excluded.enclosureUrl,
+ enclosureType = excluded.enclosureType,
+ enclosureLength = excluded.enclosureLength,
+ imageUrls = excluded.imageUrls
`)
if err != nil {
return err
@@ -521,10 +642,29 @@ func (c *Crawler) saveItems(items []*Item) error {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
+
+ // Serialize enclosure fields
+ var enclosureUrl, enclosureType sql.NullString
+ var enclosureLength sql.NullInt64
+ if item.Enclosure != nil {
+ enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
+ enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
+ enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
+ }
+
+ // Serialize imageUrls as JSON
+ var imageUrlsJSON sql.NullString
+ if len(item.ImageURLs) > 0 {
+ if data, err := json.Marshal(item.ImageURLs); err == nil {
+ imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
+ }
+ }
+
_, err := stmt.Exec(
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
+ enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
)
if err != nil {
continue // Skip failed items
@@ -537,7 +677,9 @@ func (c *Crawler) saveItems(items []*Item) error {
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
- SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
+ SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+ enclosureUrl, enclosureType, enclosureLength, imageUrls,
+ publishedAt, publishedUri
FROM items
WHERE feedUrl = ?
ORDER BY pubDate DESC
@@ -548,55 +690,15 @@ func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
}
defer rows.Close()
- var items []*Item
- for rows.Next() {
- item := &Item{}
- var guid, title, link, description, content, author sql.NullString
- var pubDate, updatedAt sql.NullTime
-
- if err := rows.Scan(
- &item.ID, &item.FeedURL, &guid, &title, &link,
- &description, &content, &author, &pubDate,
- &item.DiscoveredAt, &updatedAt,
- ); err != nil {
- continue
- }
-
- if guid.Valid {
- item.GUID = guid.String
- }
- if title.Valid {
- item.Title = title.String
- }
- if link.Valid {
- item.Link = link.String
- }
- if description.Valid {
- item.Description = description.String
- }
- if content.Valid {
- item.Content = content.String
- }
- if author.Valid {
- item.Author = author.String
- }
- if pubDate.Valid {
- item.PubDate = pubDate.Time
- }
- if updatedAt.Valid {
- item.UpdatedAt = updatedAt.Time
- }
-
- items = append(items, item)
- }
-
- return items, rows.Err()
+ return scanItems(rows)
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
- SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
+ SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt,
+ i.enclosureUrl, i.enclosureType, i.enclosureLength, i.imageUrls,
+ i.publishedAt, i.publishedUri
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
@@ -608,16 +710,27 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
}
defer rows.Close()
+ return scanItems(rows)
+}
+
+// scanItems is a helper to scan multiple item rows
+func scanItems(rows *sql.Rows) ([]*Item, error) {
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
- var pubDate, updatedAt sql.NullTime
+ var pubDate, updatedAt, publishedAt sql.NullTime
+ var enclosureUrl, enclosureType sql.NullString
+ var enclosureLength sql.NullInt64
+ var imageUrlsJSON sql.NullString
+ var publishedUri sql.NullString
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
+ &enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
+ &publishedAt, &publishedUri,
); err != nil {
continue
}
@@ -647,6 +760,32 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
item.UpdatedAt = updatedAt.Time
}
+ // Parse enclosure
+ if enclosureUrl.Valid && enclosureUrl.String != "" {
+ item.Enclosure = &Enclosure{
+ URL: enclosureUrl.String,
+ Type: enclosureType.String,
+ }
+ if enclosureLength.Valid {
+ item.Enclosure.Length = enclosureLength.Int64
+ }
+ }
+
+ // Parse imageUrls JSON
+ if imageUrlsJSON.Valid && imageUrlsJSON.String != "" {
+ var urls []string
+ if err := json.Unmarshal([]byte(imageUrlsJSON.String), &urls); err == nil {
+ item.ImageURLs = urls
+ }
+ }
+
+ if publishedAt.Valid {
+ item.PublishedAt = publishedAt.Time
+ }
+ if publishedUri.Valid {
+ item.PublishedUri = publishedUri.String
+ }
+
items = append(items, item)
}
@@ -667,10 +806,6 @@ func (c *Crawler) CleanupOldItems() (int64, error) {
// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
- if strings.Contains(feedURL, "/comment") {
- return
- }
-
// Fast path: check without lock
if c.feedExists(feedURL) {
return
@@ -690,6 +825,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
feed := &Feed{
URL: normalizeURL(feedURL),
Type: feedType,
+ Category: classifyFeed(feedURL),
DiscoveredAt: now,
LastCrawledAt: now,
Status: "active",
@@ -708,6 +844,9 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
items = c.parseAtomMetadata(body, feed)
}
+ // Refine category based on parsed title (e.g., "Comments on:")
+ feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
+
// Calculate next crawl time
feed.NextCrawlAt = c.calculateNextCrawl(feed)
@@ -723,11 +862,6 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
- // Skip comment, category, and article feeds
- if skip, _ := shouldSkipFeed(feedURL); skip {
- return
- }
-
// Fast path: check without lock
if c.feedExists(feedURL) {
return
@@ -746,6 +880,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
feed := &Feed{
URL: normalizedURL,
Type: feedType,
+ Category: classifyFeed(feedURL),
DiscoveredAt: now,
Status: "active",
SourceURL: normalizeURL(sourceURL),
@@ -896,3 +1031,103 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
return true, nil
}
+
+// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'fail')
+// If status is 'pass', the account handle is also set (auto-derived if empty)
+func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
+ feedURL = normalizeURL(feedURL)
+
+ // Auto-derive account if passing and not provided
+ if status == "pass" && account == "" {
+ account = DeriveHandleFromFeed(feedURL)
+ }
+
+ _, err := c.db.Exec(`
+ UPDATE feeds SET publishStatus = ?, publishAccount = ? WHERE url = ?
+ `, status, nullString(account), feedURL)
+ return err
+}
+
+// GetFeedsByPublishStatus returns all feeds with a specific publish status
+func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
+ rows, err := c.db.Query(`
+ SELECT url, type, category, title, description, language, siteUrl,
+ discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
+ etag, lastModified,
+ ttlMinutes, updatePeriod, updateFreq,
+ status, errorCount, lastError, lastErrorAt,
+ sourceUrl, sourceHost, tld,
+ itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
+ noUpdate,
+ publishStatus, publishAccount
+ FROM feeds
+ WHERE publishStatus = ?
+ `, status)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ return scanFeeds(rows)
+}
+
+// GetPublishCandidates returns feeds that are held for review and have items
+func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
+ rows, err := c.db.Query(`
+ SELECT url, type, category, title, description, language, siteUrl,
+ discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
+ etag, lastModified,
+ ttlMinutes, updatePeriod, updateFreq,
+ status, errorCount, lastError, lastErrorAt,
+ sourceUrl, sourceHost, tld,
+ itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
+ noUpdate,
+ publishStatus, publishAccount
+ FROM feeds
+ WHERE publishStatus = 'held' AND itemCount > 0 AND status = 'active'
+ ORDER BY itemCount DESC
+ LIMIT ?
+ `, limit)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ return scanFeeds(rows)
+}
+
+// GetUnpublishedItems returns items for a feed that haven't been published yet
+func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
+ rows, err := c.db.Query(`
+ SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+ enclosureUrl, enclosureType, enclosureLength, imageUrls,
+ publishedAt, publishedUri
+ FROM items
+ WHERE feedUrl = ? AND publishedAt IS NULL
+ ORDER BY pubDate ASC
+ LIMIT ?
+ `, feedURL, limit)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ return scanItems(rows)
+}
+
+// MarkItemPublished marks an item as published with the given URI
+func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
+ _, err := c.db.Exec(`
+ UPDATE items SET publishedAt = datetime('now'), publishedUri = ? WHERE id = ?
+ `, uri, itemID)
+ return err
+}
+
+// GetUnpublishedItemCount returns the count of unpublished items for a feed
+func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
+ var count int
+ err := c.db.QueryRow(`
+ SELECT COUNT(*) FROM items WHERE feedUrl = ? AND publishedAt IS NULL
+ `, feedURL).Scan(&count)
+ return count, err
+}
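Taken together with publisher.go (added later in this patch), the publish helpers above are meant to support a flow along these lines. A rough sketch under stated assumptions (one shared session, placeholder batch size and error handling), not part of the patch:

// publishPassedFeeds pushes unpublished items for every feed marked 'pass'.
// In the real setup each feed presumably posts from its own publishAccount;
// a single shared session is used here only to keep the sketch short.
func publishPassedFeeds(c *Crawler, p *Publisher, handle, password string) error {
	session, err := p.CreateSession(handle, password)
	if err != nil {
		return err
	}
	feeds, err := c.GetFeedsByPublishStatus("pass")
	if err != nil {
		return err
	}
	for _, feed := range feeds {
		items, err := c.GetUnpublishedItems(feed.URL, 20) // oldest first
		if err != nil {
			continue
		}
		for _, item := range items {
			uri, err := p.PublishItem(session, item)
			if err != nil {
				continue // stays unpublished; picked up on the next pass
			}
			if err := c.MarkItemPublished(item.ID, uri); err != nil {
				return err
			}
		}
	}
	return nil
}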
diff --git a/main.go b/main.go
index f552e5f..b65a7ad 100644
--- a/main.go
+++ b/main.go
@@ -3,6 +3,8 @@ package main
import (
"fmt"
"os"
+ "os/signal"
+ "syscall"
)
func main() {
@@ -17,7 +19,10 @@ func main() {
fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
os.Exit(1)
}
- defer crawler.Close()
+
+ // Setup graceful shutdown
+ sigChan := make(chan os.Signal, 1)
+ signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
// Start dashboard in background
go func() {
@@ -41,9 +46,24 @@ func main() {
// Stats loop (background) - updates once per minute
go crawler.StartStatsLoop()
- // Cleanup loop (background) - removes old items once per hour
+ // Cleanup loop (background) - removes old items once per week
go crawler.StartCleanupLoop()
- // Crawl loop (foreground - blocks forever)
- crawler.StartCrawlLoop()
+ // Maintenance loop (background) - WAL checkpoints and integrity checks
+ go crawler.StartMaintenanceLoop()
+
+ // Crawl loop (background)
+ go crawler.StartCrawlLoop()
+
+ // Wait for shutdown signal
+ sig := <-sigChan
+ fmt.Printf("\nReceived %v, shutting down gracefully...\n", sig)
+
+ // Close crawler (checkpoints WAL and closes database)
+ if err := crawler.Close(); err != nil {
+ fmt.Fprintf(os.Stderr, "Error closing crawler: %v\n", err)
+ os.Exit(1)
+ }
+
+ fmt.Println("Shutdown complete")
}
diff --git a/parser.go b/parser.go
index 9b91798..e77c72a 100644
--- a/parser.go
+++ b/parser.go
@@ -3,6 +3,7 @@ package main
import (
"encoding/xml"
"fmt"
+ "regexp"
"strings"
"time"
)
@@ -23,17 +24,52 @@ type RSSChannel struct {
UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
UpdateFreq int `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
Items []RSSItem `xml:"item"`
+ // iTunes podcast namespace
+ ITunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
+ ITunesOwner string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner>name"`
+ ITunesExplicit string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd explicit"`
+ ITunesType string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd type"`
}
type RSSItem struct {
- Title string `xml:"title"`
- Link string `xml:"link"`
- GUID string `xml:"guid"`
- Description string `xml:"description"`
- Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
- Author string `xml:"author"`
- Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
- PubDate string `xml:"pubDate"`
+ Title string `xml:"title"`
+ Link string `xml:"link"`
+ GUID string `xml:"guid"`
+ Description string `xml:"description"`
+ Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
+ Author string `xml:"author"`
+ Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
+ PubDate string `xml:"pubDate"`
+ Enclosure *RSSEnclosure `xml:"enclosure"`
+ // iTunes item elements
+ ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
+ ITunesEpisode int `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd episode"`
+ ITunesImage string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd image>href"`
+ // Media RSS elements
+ MediaContent []MediaContent `xml:"http://search.yahoo.com/mrss/ content"`
+ MediaThumbnail []MediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
+}
+
+// MediaContent represents a media:content element
+type MediaContent struct {
+ URL string `xml:"url,attr"`
+ Type string `xml:"type,attr"`
+ Medium string `xml:"medium,attr"` // image, video, audio
+ Width int `xml:"width,attr"`
+ Height int `xml:"height,attr"`
+}
+
+// MediaThumbnail represents a media:thumbnail element
+type MediaThumbnail struct {
+ URL string `xml:"url,attr"`
+ Width int `xml:"width,attr"`
+ Height int `xml:"height,attr"`
+}
+
+type RSSEnclosure struct {
+ URL string `xml:"url,attr"`
+ Type string `xml:"type,attr"`
+ Length int64 `xml:"length,attr"`
}
// Atom structs for parsing
@@ -70,6 +106,43 @@ type AtomLink struct {
Type string `xml:"type,attr"`
}
+// isPodcast checks if an RSS feed is a podcast based on content
+func isPodcast(ch RSSChannel) bool {
+ // Check for iTunes namespace elements at channel level
+ if ch.ITunesAuthor != "" || ch.ITunesOwner != "" ||
+ ch.ITunesExplicit != "" || ch.ITunesType != "" {
+ return true
+ }
+
+ // Check items for audio enclosures or iTunes elements
+ audioCount := 0
+ for _, item := range ch.Items {
+ // Check for iTunes duration or episode number
+ if item.ITunesDuration != "" || item.ITunesEpisode > 0 {
+ return true
+ }
+ // Check for audio/video enclosure
+ if item.Enclosure != nil && item.Enclosure.URL != "" {
+ mimeType := strings.ToLower(item.Enclosure.Type)
+ if strings.HasPrefix(mimeType, "audio/") ||
+ strings.HasPrefix(mimeType, "video/") ||
+ strings.Contains(mimeType, "mpeg") ||
+ strings.Contains(mimeType, "mp3") ||
+ strings.Contains(mimeType, "mp4") ||
+ strings.Contains(mimeType, "m4a") ||
+ strings.Contains(mimeType, "ogg") {
+ audioCount++
+ }
+ }
+ }
+ // If more than half the items have audio enclosures, it's a podcast
+ if len(ch.Items) > 0 && audioCount > len(ch.Items)/2 {
+ return true
+ }
+
+ return false
+}
+
func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
var rss RSS
if err := xml.Unmarshal([]byte(body), &rss); err != nil {
@@ -77,6 +150,7 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
}
ch := rss.Channel
+
feed.Title = ch.Title
feed.Description = ch.Description
feed.Language = ch.Language
@@ -86,6 +160,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
feed.UpdateFreq = ch.UpdateFreq
feed.ItemCount = len(ch.Items)
+ // Detect podcast
+ if isPodcast(ch) {
+ feed.Category = "podcast"
+ }
+
// Parse lastBuildDate
if ch.LastBuildDate != "" {
if t, err := parseRSSDate(ch.LastBuildDate); err == nil {
@@ -130,6 +209,18 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
}
}
+ // Map enclosure
+ if rssItem.Enclosure != nil && rssItem.Enclosure.URL != "" {
+ item.Enclosure = &Enclosure{
+ URL: rssItem.Enclosure.URL,
+ Type: rssItem.Enclosure.Type,
+ Length: rssItem.Enclosure.Length,
+ }
+ }
+
+ // Extract images from various sources
+ item.ImageURLs = extractItemImages(rssItem)
+
items = append(items, item)
}
@@ -324,3 +415,88 @@ func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
// Default: crawl every 6 hours
return now.Add(6 * time.Hour)
}
+
+// extractItemImages extracts image URLs from an RSS item
+// Sources: media:content, media:thumbnail, iTunes image, and <img> tags in HTML
+func extractItemImages(rssItem RSSItem) []string {
+ seen := make(map[string]bool)
+ var images []string
+
+ addImage := func(url string) {
+ url = strings.TrimSpace(url)
+ if url == "" || seen[url] {
+ return
+ }
+ // Basic validation
+ if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
+ return
+ }
+ seen[url] = true
+ images = append(images, url)
+ }
+
+ // 1. Media RSS content (prefer larger images)
+ for _, mc := range rssItem.MediaContent {
+ if mc.URL != "" && (mc.Medium == "image" || strings.HasPrefix(mc.Type, "image/")) {
+ addImage(mc.URL)
+ }
+ }
+
+ // 2. Media RSS thumbnails
+ for _, mt := range rssItem.MediaThumbnail {
+ if mt.URL != "" {
+ addImage(mt.URL)
+ }
+ }
+
+ // 3. iTunes image
+ if rssItem.ITunesImage != "" {
+ addImage(rssItem.ITunesImage)
+ }
+
+ // 4. Image enclosure
+ if rssItem.Enclosure != nil && strings.HasPrefix(rssItem.Enclosure.Type, "image/") {
+ addImage(rssItem.Enclosure.URL)
+ }
+
+ // 5. Extract <img> tags from description and content
+ htmlImages := extractImgTags(rssItem.Description)
+ htmlImages = append(htmlImages, extractImgTags(rssItem.Content)...)
+ for _, img := range htmlImages {
+ addImage(img)
+ }
+
+ return images
+}
+
+// extractImgTags extracts src URLs from <img> tags in HTML
+func extractImgTags(html string) []string {
+ if html == "" {
+ return nil
+ }
+
+ var urls []string
+
+ // Simple regex to find img src attributes
+ // Matches: src="..." or src='...'
+ imgRegex := regexp.MustCompile(`<img[^>]+src\s*=\s*["']([^"']+)["']`)
+ matches := imgRegex.FindAllStringSubmatch(html, -1)
+
+ for _, match := range matches {
+ if len(match) > 1 {
+ url := strings.TrimSpace(match[1])
+ // Skip data URIs, tracking pixels, and tiny images
+ if strings.HasPrefix(url, "data:") {
+ continue
+ }
+ // Skip common tracking/spacer images
+ if strings.Contains(url, "pixel") || strings.Contains(url, "spacer") ||
+ strings.Contains(url, "1x1") || strings.Contains(url, "blank.gif") {
+ continue
+ }
+ urls = append(urls, url)
+ }
+ }
+
+ return urls
+}
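Before the new publisher.go, a quick illustration of what the <img> extraction above keeps and drops (tracking pixels and data: URIs are filtered out); an example-test sketch for a _test.go file in the same package, not part of the patch:

package main

import "fmt"

func Example_extractImgTags() {
	html := `<p>Intro</p>
<img src="https://example.com/photos/cover.jpg" alt="cover">
<img src="https://tracker.example.com/pixel.gif?id=42">
<img src="data:image/png;base64,AAAA">`
	for _, u := range extractImgTags(html) {
		fmt.Println(u)
	}
	// Output:
	// https://example.com/photos/cover.jpg
}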
diff --git a/publisher.go b/publisher.go
new file mode 100644
index 0000000..d62577c
--- /dev/null
+++ b/publisher.go
@@ -0,0 +1,909 @@
+package main
+
+import (
+ "bytes"
+ "crypto/sha256"
+ "encoding/base32"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strings"
+ "time"
+)
+
+// Publisher handles posting items to AT Protocol PDS
+type Publisher struct {
+ pdsHost string
+ httpClient *http.Client
+}
+
+// PDSSession holds authentication info for a PDS account
+type PDSSession struct {
+ DID string `json:"did"`
+ Handle string `json:"handle"`
+ AccessJwt string `json:"accessJwt"`
+ RefreshJwt string `json:"refreshJwt"`
+}
+
+// BskyPost represents an app.bsky.feed.post record
+type BskyPost struct {
+ Type string `json:"$type"`
+ Text string `json:"text"`
+ CreatedAt string `json:"createdAt"`
+ Facets []BskyFacet `json:"facets,omitempty"`
+ Embed *BskyEmbed `json:"embed,omitempty"`
+}
+
+type BskyFacet struct {
+ Index BskyByteSlice `json:"index"`
+ Features []BskyFeature `json:"features"`
+}
+
+type BskyByteSlice struct {
+ ByteStart int `json:"byteStart"`
+ ByteEnd int `json:"byteEnd"`
+}
+
+type BskyFeature struct {
+ Type string `json:"$type"`
+ URI string `json:"uri,omitempty"`
+}
+
+type BskyEmbed struct {
+ Type string `json:"$type"`
+ External *BskyExternal `json:"external,omitempty"`
+ Images []BskyImage `json:"images,omitempty"`
+}
+
+type BskyExternal struct {
+ URI string `json:"uri"`
+ Title string `json:"title"`
+ Description string `json:"description"`
+ Thumb *BlobRef `json:"thumb,omitempty"`
+}
+
+type BskyImage struct {
+ Alt string `json:"alt"`
+ Image *BlobRef `json:"image"`
+}
+
+// NewPublisher creates a new Publisher instance
+func NewPublisher(pdsHost string) *Publisher {
+ return &Publisher{
+ pdsHost: pdsHost,
+ httpClient: &http.Client{
+ Timeout: 30 * time.Second,
+ },
+ }
+}
+
+// CreateSession authenticates with the PDS and returns a session
+func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) {
+ payload := map[string]string{
+ "identifier": handle,
+ "password": password,
+ }
+ body, err := json.Marshal(payload)
+ if err != nil {
+ return nil, err
+ }
+
+ resp, err := p.httpClient.Post(
+ p.pdsHost+"/xrpc/com.atproto.server.createSession",
+ "application/json",
+ bytes.NewReader(body),
+ )
+ if err != nil {
+ return nil, err
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ respBody, _ := io.ReadAll(resp.Body)
+ return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(respBody))
+ }
+
+ var session PDSSession
+ if err := json.NewDecoder(resp.Body).Decode(&session); err != nil {
+ return nil, err
+ }
+
+ return &session, nil
+}
+
+// CreateAccount creates a new account on the PDS
+// Requires an invite code if the PDS has invites enabled
+func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) {
+ payload := map[string]interface{}{
+ "handle": handle,
+ "email": email,
+ "password": password,
+ }
+ if inviteCode != "" {
+ payload["inviteCode"] = inviteCode
+ }
+
+ body, err := json.Marshal(payload)
+ if err != nil {
+ return nil, err
+ }
+
+ resp, err := p.httpClient.Post(
+ p.pdsHost+"/xrpc/com.atproto.server.createAccount",
+ "application/json",
+ bytes.NewReader(body),
+ )
+ if err != nil {
+ return nil, err
+ }
+ defer resp.Body.Close()
+
+ respBody, _ := io.ReadAll(resp.Body)
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(respBody))
+ }
+
+ var session PDSSession
+ if err := json.Unmarshal(respBody, &session); err != nil {
+ return nil, err
+ }
+
+ return &session, nil
+}
+
+// CreateInviteCode creates an invite code using PDS admin password (Basic Auth)
+func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) {
+ payload := map[string]interface{}{
+ "useCount": useCount,
+ }
+
+ body, err := json.Marshal(payload)
+ if err != nil {
+ return "", err
+ }
+
+ req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(body))
+ if err != nil {
+ return "", err
+ }
+ req.Header.Set("Content-Type", "application/json")
+ // PDS admin APIs use Basic Auth with "admin" as username
+ req.SetBasicAuth("admin", adminPassword)
+
+ resp, err := p.httpClient.Do(req)
+ if err != nil {
+ return "", err
+ }
+ defer resp.Body.Close()
+
+ respBody, _ := io.ReadAll(resp.Body)
+
+ if resp.StatusCode != http.StatusOK {
+ return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(respBody))
+ }
+
+ var result struct {
+ Code string `json:"code"`
+ }
+ if err := json.Unmarshal(respBody, &result); err != nil {
+ return "", err
+ }
+
+ return result.Code, nil
+}
+
+// GenerateRkey creates a deterministic rkey from a GUID and timestamp
+// Uses a truncated base32-encoded SHA256 hash
+// Including the timestamp allows regenerating a new rkey by updating discoveredAt
+func GenerateRkey(guid string, timestamp time.Time) string {
+ if guid == "" {
+ return ""
+ }
+
+ // Combine GUID with timestamp for the hash input
+ // Format timestamp to second precision for consistency
+ input := guid + "|" + timestamp.UTC().Format(time.RFC3339)
+ hash := sha256.Sum256([]byte(input))
+ // Use first 10 bytes (80 bits) - plenty for uniqueness
+ // Base32 encode without padding, lowercase for rkey compatibility
+ encoded := base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(hash[:10])
+ return strings.ToLower(encoded)
+}
+
+// extractURLs finds all URLs in a string
+func extractURLs(text string) []string {
+ // Match http:// or https:// URLs
+ urlRegex := regexp.MustCompile(`https?://[^\s<>"'\)]+`)
+ matches := urlRegex.FindAllString(text, -1)
+
+ // Clean up trailing punctuation
+ var urls []string
+ for _, u := range matches {
+ // Remove trailing punctuation that's likely not part of the URL
+ u = strings.TrimRight(u, ".,;:!?")
+ if u != "" {
+ urls = append(urls, u)
+ }
+ }
+ return urls
+}
+
+// PublishItem posts a feed item to the PDS
+// Returns the AT URI of the created record, or error
+func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
+ if item.GUID == "" && item.Link == "" {
+ return "", fmt.Errorf("item has no GUID or link, cannot publish")
+ }
+
+ // Collect all unique URLs: main link + any URLs in description
+ urlSet := make(map[string]bool)
+ var allURLs []string
+
+ // Add main link first
+ if item.Link != "" {
+ urlSet[item.Link] = true
+ allURLs = append(allURLs, item.Link)
+ }
+
+ // Add enclosure URL for podcasts/media (audio/video)
+ if item.Enclosure != nil && item.Enclosure.URL != "" {
+ encType := strings.ToLower(item.Enclosure.Type)
+ if strings.HasPrefix(encType, "audio/") || strings.HasPrefix(encType, "video/") {
+ if !urlSet[item.Enclosure.URL] {
+ urlSet[item.Enclosure.URL] = true
+ allURLs = append(allURLs, item.Enclosure.URL)
+ }
+ }
+ }
+
+ // Extract URLs from description
+ descURLs := extractURLs(item.Description)
+ for _, u := range descURLs {
+ if !urlSet[u] {
+ urlSet[u] = true
+ allURLs = append(allURLs, u)
+ }
+ }
+
+ // Extract URLs from content if available
+ contentURLs := extractURLs(item.Content)
+ for _, u := range contentURLs {
+ if !urlSet[u] {
+ urlSet[u] = true
+ allURLs = append(allURLs, u)
+ }
+ }
+
+ // Build post text: title + all links
+ // Bluesky has 300 grapheme limit
+ var textBuilder strings.Builder
+ textBuilder.WriteString(item.Title)
+
+ for _, u := range allURLs {
+ textBuilder.WriteString("\n\n")
+ textBuilder.WriteString(u)
+ }
+
+ text := textBuilder.String()
+
+ // Truncate title if text is too long (keep URLs intact)
+ const maxLen = 300
+ if len(text) > maxLen {
+ // Calculate space needed for URLs
+ urlSpace := 0
+ for _, u := range allURLs {
+ urlSpace += len(u) + 2 // +2 for \n\n
+ }
+
+ maxTitleLen := maxLen - urlSpace - 3 // -3 for "..."
+ if maxTitleLen > 10 {
+ text = item.Title[:maxTitleLen] + "..."
+ for _, u := range allURLs {
+ text += "\n\n" + u
+ }
+ }
+ }
+
+ // Use item's pubDate for createdAt, fall back to now
+ createdAt := time.Now()
+ if !item.PubDate.IsZero() {
+ createdAt = item.PubDate
+ }
+
+ post := BskyPost{
+ Type: "app.bsky.feed.post",
+ Text: text,
+ CreatedAt: createdAt.Format(time.RFC3339),
+ }
+
+ // Add facets for all URLs
+ for _, u := range allURLs {
+ linkStart := strings.Index(text, u)
+ if linkStart >= 0 {
+ // Use byte positions (for UTF-8 this matters)
+ byteStart := len(text[:linkStart])
+ byteEnd := byteStart + len(u)
+
+ post.Facets = append(post.Facets, BskyFacet{
+ Index: BskyByteSlice{
+ ByteStart: byteStart,
+ ByteEnd: byteEnd,
+ },
+ Features: []BskyFeature{
+ {
+ Type: "app.bsky.richtext.facet#link",
+ URI: u,
+ },
+ },
+ })
+ }
+ }
+
+ // Decide embed type based on content
+ // Priority: images > external link card
+ if len(item.ImageURLs) > 0 {
+ // Try to upload images (up to 4)
+ uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title)
+ if len(uploadedImages) > 0 {
+ post.Embed = &BskyEmbed{
+ Type: "app.bsky.embed.images",
+ Images: uploadedImages,
+ }
+ }
+ }
+
+ // Fall back to external embed if no images were uploaded
+ if post.Embed == nil && len(allURLs) > 0 {
+ external := &BskyExternal{
+ URI: allURLs[0],
+ Title: item.Title,
+ Description: truncate(stripHTML(item.Description), 300),
+ }
+
+ // Try to add thumbnail from first image
+ if len(item.ImageURLs) > 0 {
+ if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
+ external.Thumb = thumb
+ }
+ }
+
+ post.Embed = &BskyEmbed{
+ Type: "app.bsky.embed.external",
+ External: external,
+ }
+ }
+
+ // Use GUID + discoveredAt for deterministic rkey
+ // This allows regenerating a new rkey by updating discoveredAt if needed
+ guidForRkey := item.GUID
+ if guidForRkey == "" {
+ guidForRkey = item.Link
+ }
+ rkey := GenerateRkey(guidForRkey, item.DiscoveredAt)
+
+ // Create the record with deterministic rkey
+ payload := map[string]interface{}{
+ "repo": session.DID,
+ "collection": "app.bsky.feed.post",
+ "rkey": rkey,
+ "record": post,
+ }
+
+ body, err := json.Marshal(payload)
+ if err != nil {
+ return "", err
+ }
+
+ req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
+ if err != nil {
+ return "", err
+ }
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
+
+ resp, err := p.httpClient.Do(req)
+ if err != nil {
+ return "", err
+ }
+ defer resp.Body.Close()
+
+ respBody, _ := io.ReadAll(resp.Body)
+
+ if resp.StatusCode != http.StatusOK {
+ return "", fmt.Errorf("create record failed: %s - %s", resp.Status, string(respBody))
+ }
+
+ var result struct {
+ URI string `json:"uri"`
+ CID string `json:"cid"`
+ }
+ if err := json.Unmarshal(respBody, &result); err != nil {
+ return "", err
+ }
+
+ return result.URI, nil
+}
+
+// uploadImages fetches and uploads up to 4 images, returning BskyImage structs
+func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage {
+ var images []BskyImage
+ maxImages := 4
+ if len(imageURLs) < maxImages {
+ maxImages = len(imageURLs)
+ }
+
+ for i := 0; i < maxImages; i++ {
+ blob := p.fetchAndUploadImage(session, imageURLs[i])
+ if blob != nil {
+ images = append(images, BskyImage{
+ Alt: altText,
+ Image: blob,
+ })
+ }
+ }
+
+ return images
+}
+
+// fetchAndUploadImage downloads an image and uploads it to the PDS
+func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef {
+ // Fetch the image
+ resp, err := p.httpClient.Get(imageURL)
+ if err != nil {
+ return nil
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil
+ }
+
+ // Check content type
+ contentType := resp.Header.Get("Content-Type")
+ if contentType == "" {
+ // Try to guess from URL
+ if strings.HasSuffix(strings.ToLower(imageURL), ".png") {
+ contentType = "image/png"
+ } else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") {
+ contentType = "image/gif"
+ } else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") {
+ contentType = "image/webp"
+ } else {
+ contentType = "image/jpeg" // Default
+ }
+ }
+
+ // Only accept image types
+ if !strings.HasPrefix(contentType, "image/") {
+ return nil
+ }
+
+ // Read image data (limit to 1MB to avoid issues)
+ data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
+ if err != nil || len(data) == 0 {
+ return nil
+ }
+
+ // Upload to PDS
+ blob, err := p.UploadBlob(session, data, contentType)
+ if err != nil {
+ return nil
+ }
+
+ return blob
+}
+
+func truncate(s string, maxLen int) string {
+ if len(s) <= maxLen {
+ return s
+ }
+ return s[:maxLen-3] + "..."
+}
+
+// stripHTML removes HTML tags from a string
+func stripHTML(s string) string {
+ // Remove HTML tags
+ tagRegex := regexp.MustCompile(`<[^>]*>`)
+ s = tagRegex.ReplaceAllString(s, "")
+
+ // Decode common HTML entities
+ s = strings.ReplaceAll(s, "&amp;", "&")
+ s = strings.ReplaceAll(s, "&lt;", "<")
+ s = strings.ReplaceAll(s, "&gt;", ">")
+ s = strings.ReplaceAll(s, "&quot;", "\"")
+ s = strings.ReplaceAll(s, "&#39;", "'")
+ s = strings.ReplaceAll(s, "&nbsp;", " ")
+
+ // Collapse whitespace
+ spaceRegex := regexp.MustCompile(`\s+`)
+ s = spaceRegex.ReplaceAllString(s, " ")
+
+ return strings.TrimSpace(s)
+}
+
+// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
+// Format: {combined-path-and-hostname}.1440.news
+// The PDS limits subdomains to 18 characters, so we prioritize meaningful parts
+// Example: news.ycombinator.com/showrss → show-ycombinator.1440.news
+func DeriveHandleFromFeed(feedURL string) string {
+ const maxSubdomainLen = 18
+
+ // Ensure we have a scheme for parsing
+ if !strings.Contains(feedURL, "://") {
+ feedURL = "https://" + feedURL
+ }
+
+ u, err := url.Parse(feedURL)
+ if err != nil {
+ return ""
+ }
+
+ hostname := strings.ToLower(u.Hostname())
+ path := strings.ToLower(u.Path)
+
+ // Remove common feed suffixes/extensions
+ suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
+ for _, suffix := range suffixesToRemove {
+ path = strings.TrimSuffix(path, suffix)
+ }
+
+ // Split path into segments
+ segments := strings.Split(strings.Trim(path, "/"), "/")
+
+ // Filter out common feed-related words
+ skipWords := map[string]bool{
+ "rss": true, "feed": true, "feeds": true, "atom": true,
+ "xml": true, "default": true, "index": true, "services": true,
+ "nyt": true, // NYTimes uses /services/xml/rss/nyt/
+ }
+
+ var pathParts []string
+ for _, seg := range segments {
+ seg = cleanHandleSegment(seg)
+ if seg != "" && !skipWords[seg] {
+ pathParts = append(pathParts, seg)
+ }
+ }
+
+ // Split hostname into parts, drop common TLDs to save space
+ hostParts := strings.Split(hostname, ".")
+ commonTLDs := map[string]bool{
+ "com": true, "org": true, "net": true, "io": true, "co": true,
+ "edu": true, "gov": true, "uk": true, "de": true, "fr": true,
+ }
+
+ // Remove TLD if it's common (to save characters)
+ if len(hostParts) > 1 && commonTLDs[hostParts[len(hostParts)-1]] {
+ hostParts = hostParts[:len(hostParts)-1]
+ }
+
+ // Build subdomain: path parts first (they differentiate feeds), then host parts
+ // Priority order for fitting in 18 chars:
+ // 1. Main hostname part (e.g., "ycombinator")
+ // 2. Path prefix (e.g., "show")
+ // 3. Hostname subdomain (e.g., "news")
+
+ var subdomain string
+
+ // Use the registrable name as the main host part; after dropping a common TLD,
+ // the last element is what we keep (e.g., "ycombinator" from "news.ycombinator")
+ mainHost := hostParts[len(hostParts)-1]
+
+ // If path parts exist, prepend them
+ if len(pathParts) > 0 {
+ subdomain = pathParts[0] + "-" + mainHost
+ } else if len(hostParts) > 1 {
+ // No path, use subdomain-hostname (e.g., "news-ycombinator")
+ subdomain = hostParts[0] + "-" + mainHost
+ } else {
+ subdomain = mainHost
+ }
+
+ // If still too long, just use main hostname
+ if len(subdomain) > maxSubdomainLen {
+ subdomain = mainHost
+ }
+
+ // Final safety: truncate if still too long
+ if len(subdomain) > maxSubdomainLen {
+ subdomain = subdomain[:maxSubdomainLen]
+ }
+
+ subdomain = strings.Trim(subdomain, "-")
+
+ // Collapse multiple hyphens
+ for strings.Contains(subdomain, "--") {
+ subdomain = strings.ReplaceAll(subdomain, "--", "-")
+ }
+
+ return subdomain + ".1440.news"
+}
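To make the priority rules concrete, a few hand-worked expectations written as a _test.go sketch; apart from the Hacker News URL already used as the example above, the inputs are illustrative:

func TestDeriveHandleFromFeed_Examples(t *testing.T) {
	cases := map[string]string{
		"news.ycombinator.com/showrss":  "show-ycombinator.1440.news",  // path part + main host
		"www.theguardian.com/world/rss": "world-theguardian.1440.news", // "/rss" suffix dropped
		"example.org/feed":              "example.1440.news",           // no usable path parts
		"blog.example.com":              "blog-example.1440.news",      // subdomain + main host
	}
	for in, want := range cases {
		if got := DeriveHandleFromFeed(in); got != want {
			t.Errorf("DeriveHandleFromFeed(%q) = %q, want %q", in, got, want)
		}
	}
}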
+
+// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
+// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
+func cleanHandleSegment(s string) string {
+ // Remove file extensions
+ if idx := strings.LastIndex(s, "."); idx > 0 {
+ s = s[:idx]
+ }
+
+ // Convert to lowercase
+ s = strings.ToLower(s)
+
+ // Strip common feed prefixes/suffixes from the segment itself
+ // e.g., "showrss" → "show", "rssworld" → "world"
+ feedAffixes := []string{"rss", "feed", "atom", "xml"}
+ for _, affix := range feedAffixes {
+ // Strip suffix (e.g., "showrss" → "show")
+ if strings.HasSuffix(s, affix) && len(s) > len(affix) {
+ s = strings.TrimSuffix(s, affix)
+ break
+ }
+ // Strip prefix (e.g., "rssworld" → "world")
+ if strings.HasPrefix(s, affix) && len(s) > len(affix) {
+ s = strings.TrimPrefix(s, affix)
+ break
+ }
+ }
+
+ // Replace underscores and other separators with hyphens
+ s = strings.ReplaceAll(s, "_", "-")
+ s = strings.ReplaceAll(s, " ", "-")
+
+ // Remove any characters that aren't alphanumeric or hyphens
+ reg := regexp.MustCompile(`[^a-z0-9-]`)
+ s = reg.ReplaceAllString(s, "")
+
+ // Collapse multiple hyphens
+ for strings.Contains(s, "--") {
+ s = strings.ReplaceAll(s, "--", "-")
+ }
+
+ // Trim leading/trailing hyphens
+ s = strings.Trim(s, "-")
+
+ return s
+}
+
+// SplitHandle extracts the path prefix and hostname from a derived handle
+// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
+func SplitHandle(handle string) (prefix string, hostname string) {
+ // Remove .1440.news suffix
+ handle = strings.TrimSuffix(handle, ".1440.news")
+
+ parts := strings.Split(handle, ".")
+
+ // Try to find where hostname starts by looking for valid hostname patterns
+ if len(parts) >= 2 {
+ for i := 0; i < len(parts)-1; i++ {
+ remaining := strings.Join(parts[i:], ".")
+ if looksLikeHostname(remaining) {
+ if i > 0 {
+ prefix = strings.Join(parts[:i], ".")
+ }
+ hostname = remaining
+ return
+ }
+ }
+ }
+
+ // Fallback: no prefix, entire thing is hostname
+ hostname = handle
+ return "", hostname
+}
+
+func isLikelyTLDPart(s string) bool {
+ tlds := map[string]bool{
+ "com": true, "org": true, "net": true, "edu": true, "gov": true,
+ "io": true, "co": true, "uk": true, "de": true, "fr": true,
+ "jp": true, "au": true, "ca": true, "nl": true, "se": true,
+ "news": true, "blog": true, "tech": true, "dev": true,
+ }
+ return tlds[s]
+}
+
+func isTwoPartTLD(first, second string) bool {
+ twoPartTLDs := map[string]bool{
+ "co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
+ "org.uk": true, "net.au": true, "com.br": true,
+ }
+ return twoPartTLDs[first+"."+second]
+}
+
+func looksLikeHostname(s string) bool {
+ // A hostname typically has at least one dot and ends with a TLD-like part
+ parts := strings.Split(s, ".")
+ if len(parts) < 2 {
+ return false
+ }
+ lastPart := parts[len(parts)-1]
+ return isLikelyTLDPart(lastPart)
+}
+
+// BlobRef represents a blob reference for profile images
+type BlobRef struct {
+ Type string `json:"$type"`
+ Ref Link `json:"ref"`
+ MimeType string `json:"mimeType"`
+ Size int64 `json:"size"`
+}
+
+type Link struct {
+ Link string `json:"$link"`
+}
+
+// UploadBlob uploads an image to the PDS and returns a blob reference
+func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) {
+ req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data))
+ if err != nil {
+ return nil, err
+ }
+ req.Header.Set("Content-Type", mimeType)
+ req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
+
+ resp, err := p.httpClient.Do(req)
+ if err != nil {
+ return nil, err
+ }
+ defer resp.Body.Close()
+
+ respBody, _ := io.ReadAll(resp.Body)
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(respBody))
+ }
+
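+ // A successful uploadBlob response typically looks like (CID shortened, values illustrative):
+ //   {"blob": {"$type": "blob", "ref": {"$link": "bafkrei..."}, "mimeType": "image/png", "size": 4096}}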
+ var result struct {
+ Blob BlobRef `json:"blob"`
+ }
+ if err := json.Unmarshal(respBody, &result); err != nil {
+ return nil, err
+ }
+
+ return &result.Blob, nil
+}
+
+// UpdateProfile updates the profile for an account
+func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error {
+ // First, get the current profile to preserve any existing fields
+ getReq, err := http.NewRequest("GET",
+ p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self",
+ nil)
+ if err != nil {
+ return err
+ }
+ getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt)
+
+ getResp, err := p.httpClient.Do(getReq)
+
+ var existingCID string
+ profile := map[string]interface{}{
+ "$type": "app.bsky.actor.profile",
+ }
+
+ if err == nil && getResp.StatusCode == http.StatusOK {
+ defer getResp.Body.Close()
+ var existing struct {
+ CID string `json:"cid"`
+ Value map[string]interface{} `json:"value"`
+ }
+ if json.NewDecoder(getResp.Body).Decode(&existing) == nil {
+ existingCID = existing.CID
+ profile = existing.Value
+ }
+ } else if getResp != nil {
+ getResp.Body.Close()
+ }
+
+ // Update fields
+ if displayName != "" {
+ profile["displayName"] = displayName
+ }
+ if description != "" {
+ profile["description"] = description
+ }
+ if avatar != nil {
+ profile["avatar"] = avatar
+ }
+
+ // Put the record
+ payload := map[string]interface{}{
+ "repo": session.DID,
+ "collection": "app.bsky.actor.profile",
+ "rkey": "self",
+ "record": profile,
+ }
+ if existingCID != "" {
+ payload["swapRecord"] = existingCID
+ }
+
+ body, err := json.Marshal(payload)
+ if err != nil {
+ return err
+ }
+
+ req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body))
+ if err != nil {
+ return err
+ }
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
+
+ resp, err := p.httpClient.Do(req)
+ if err != nil {
+ return err
+ }
+ defer resp.Body.Close()
+
+ respBody, _ := io.ReadAll(resp.Body)
+
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody))
+ }
+
+ return nil
+}
+
+// FetchFavicon downloads a favicon/icon from a URL
+func FetchFavicon(siteURL string) ([]byte, string, error) {
+ // Try common favicon locations
+ if !strings.HasPrefix(siteURL, "http") {
+ siteURL = "https://" + siteURL
+ }
+
+ u, err := url.Parse(siteURL)
+ if err != nil {
+ return nil, "", err
+ }
+
+ baseURL := u.Scheme + "://" + u.Host
+
+ // Try apple-touch-icon first (usually higher quality)
+ iconURLs := []string{
+ baseURL + "/apple-touch-icon.png",
+ baseURL + "/apple-touch-icon-precomposed.png",
+ baseURL + "/favicon.png",
+ baseURL + "/favicon.ico",
+ }
+
+ client := &http.Client{Timeout: 10 * time.Second}
+
+ for _, iconURL := range iconURLs {
+ resp, err := client.Get(iconURL)
+ if err != nil {
+ continue
+ }
+
+ if resp.StatusCode != http.StatusOK {
+ resp.Body.Close()
+ continue
+ }
+
+ // Read then close the body before the next iteration; a deferred Close here
+ // would keep every candidate response open until the function returns.
+ data, err := io.ReadAll(resp.Body)
+ resp.Body.Close()
+ if err != nil {
+ continue
+ }
+
+ // Determine mime type
+ contentType := resp.Header.Get("Content-Type")
+ if contentType == "" {
+ if strings.HasSuffix(iconURL, ".png") {
+ contentType = "image/png"
+ } else if strings.HasSuffix(iconURL, ".ico") {
+ contentType = "image/x-icon"
+ } else {
+ contentType = "image/png" // default
+ }
+ }
+
+ return data, contentType, nil
+ }
+
+ return nil, "", fmt.Errorf("no favicon found for %s", siteURL)
+}
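Putting the pieces together, a minimal sketch of how favicon fetching, blob upload, and profile update might compose when provisioning a feed account. The setupProfile name and the 64/256 length caps are assumptions for illustration; only FetchFavicon, UploadBlob, UpdateProfile, truncate, and stripHTML come from the patch itself:

func setupProfile(p *Publisher, session *PDSSession, title, description, siteURL string) error {
	var avatar *BlobRef
	if data, mime, err := FetchFavicon(siteURL); err == nil {
		// The avatar is optional: fall back to a text-only profile if the upload fails.
		if blob, uploadErr := p.UploadBlob(session, data, mime); uploadErr == nil {
			avatar = blob
		}
	}
	return p.UpdateProfile(session,
		truncate(title, 64),
		truncate(stripHTML(description), 256),
		avatar)
}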
diff --git a/static/dashboard.css b/static/dashboard.css
index c5668cc..f6be04f 100644
--- a/static/dashboard.css
+++ b/static/dashboard.css
@@ -53,3 +53,31 @@ td { font-size: 13px; color: #ffffff; }
#searchInput::placeholder { color: #555; }
.search-host { margin-bottom: 10px; }
.search-feed:hover { background: #1a1a1a; }
+
+/* Command buttons */
+.cmd-btn {
+ background: #1a1a1a;
+ border: 1px solid #333;
+ border-radius: 4px;
+ color: #0af;
+ padding: 6px 12px;
+ margin-right: 8px;
+ margin-bottom: 4px;
+ font-size: 13px;
+ font-family: monospace;
+ cursor: pointer;
+ transition: background 0.2s, border-color 0.2s;
+}
+.cmd-btn:hover {
+ background: #252525;
+ border-color: #0af;
+}
+.cmd-btn:active {
+ background: #0af;
+ color: #000;
+}
+
+/* Visit link */
+.visit-link:hover {
+ color: #0af !important;
+}
diff --git a/static/dashboard.js b/static/dashboard.js
index 796d452..37348d6 100644
--- a/static/dashboard.js
+++ b/static/dashboard.js
@@ -10,508 +10,643 @@ function initDashboard() {
return div.innerHTML;
}
- // All domains state
- let allDomainsOffset = 0;
- let allDomainsLoading = false;
- let allDomainsEnd = false;
- let expandedDomain = null;
- let expandedFeed = null;
- const PAGE_SIZE = 100;
- const PREFETCH_THRESHOLD = 100; // Prefetch when within 100 domains of bottom
+ // Current filter state
+ let currentFilters = {};
+ let infiniteScrollState = null;
+ let isLoadingMore = false;
- // Search state
- let searchTimeout = null;
- let isSearching = false;
+ // Update command input to reflect current filters
+ function updateCommandInput() {
+ const parts = [];
+ if (currentFilters.tld) parts.push('tld:.' + currentFilters.tld);
+ if (currentFilters.domain) parts.push('domain:' + currentFilters.domain);
+ if (currentFilters.feedStatus) parts.push('feeds:' + currentFilters.feedStatus);
+ if (currentFilters.domainStatus) parts.push('domains:' + currentFilters.domainStatus);
+ document.getElementById('commandInput').value = parts.length > 0 ? parts.join(' ') : '/help';
+ }
- async function loadMoreDomains() {
- if (allDomainsLoading || allDomainsEnd) return;
+ // Parse command into filters
+ function parseCommand(cmd) {
+ const filters = {};
+ const parts = cmd.trim().toLowerCase().split(/\s+/);
- allDomainsLoading = true;
- const loadingEl = document.getElementById('allDomainsLoading');
- loadingEl.style.display = 'block';
-
- try {
- const response = await fetch('/api/allDomains?offset=' + allDomainsOffset + '&limit=' + PAGE_SIZE);
- const domains = await response.json();
-
- if (!domains || domains.length === 0) {
- allDomainsEnd = true;
- loadingEl.style.display = 'none';
- return;
+ for (const part of parts) {
+ if (part.startsWith('tld:.') || part.startsWith('tld:')) {
+ filters.tld = part.replace('tld:.', '').replace('tld:', '');
+ } else if (part.startsWith('domain:')) {
+ filters.domain = part.substring(7);
+ } else if (part.startsWith('feeds:')) {
+ filters.feedStatus = part.substring(6);
+ } else if (part.startsWith('domains:')) {
+ filters.domainStatus = part.substring(8);
+ } else if (part === 'active' || part === 'error' || part === 'dead') {
+ // Shorthand for feed status
+ filters.feedStatus = part;
+ } else if (part === 'unchecked' || part === 'checked') {
+ // Shorthand for domain status
+ filters.domainStatus = part;
+ } else if (part === '/tlds' || part === '/tld') {
+ filters.showTlds = true;
+ } else if (part === '/help') {
+ filters.help = true;
}
+ }
- const container = document.getElementById('allDomains');
- domains.forEach(d => {
- const row = document.createElement('div');
- row.className = 'domain-row';
- row.innerHTML =
- '
' +
- '' + escapeHtml(d.host) + ' ' +
- '' + commaFormat(d.feeds_found) + ' ' +
- '
' +
- '
';
+ return filters;
+ }
- row.querySelector('.stat-row').addEventListener('click', () => toggleDomainFeeds(d.host, row));
- container.appendChild(row);
+ // Render breadcrumb based on current filters
+ function renderBreadcrumb() {
+ const breadcrumb = document.getElementById('breadcrumb');
+ const parts = [];
+
+ parts.push('
home ');
+
+ if (currentFilters.tld) {
+ parts.push('
.' + escapeHtml(currentFilters.tld) + ' ');
+ }
+ if (currentFilters.domain) {
+ parts.push('
' + escapeHtml(currentFilters.domain) + ' ');
+ }
+ if (currentFilters.feedStatus) {
+ parts.push('
feeds:' + escapeHtml(currentFilters.feedStatus) + ' ');
+ }
+ if (currentFilters.domainStatus) {
+ parts.push('
domains:' + escapeHtml(currentFilters.domainStatus) + ' ');
+ }
+
+ breadcrumb.innerHTML = parts.join('
/ ');
+ breadcrumb.style.display = parts.length > 1 ? 'block' : 'none';
+
+ // Add click handlers
+ breadcrumb.querySelectorAll('.bc-item').forEach(el => {
+ el.addEventListener('click', () => {
+ const action = el.dataset.action;
+ if (action === 'home') {
+ currentFilters = {};
+ showHelp();
+ } else if (action === 'tld') {
+ delete currentFilters.domain;
+ delete currentFilters.feedStatus;
+ delete currentFilters.domainStatus;
+ executeFilters();
+ } else if (action === 'domain') {
+ delete currentFilters.feedStatus;
+ executeFilters();
+ }
});
+ });
+ }
- allDomainsOffset += domains.length;
- loadingEl.style.display = 'none';
+ // Infinite scroll
+ function setupInfiniteScroll(loadMoreFn) {
+ infiniteScrollState = { loadMore: loadMoreFn, ended: false };
+ }
- // If we got fewer than PAGE_SIZE, we've reached the end
- if (domains.length < PAGE_SIZE) {
- allDomainsEnd = true;
- }
- } catch (err) {
- console.error('Failed to load domains:', err);
- } finally {
- allDomainsLoading = false;
+ function clearInfiniteScroll() {
+ infiniteScrollState = null;
+ }
+
+ function checkInfiniteScroll() {
+ if (!infiniteScrollState || infiniteScrollState.ended || isLoadingMore) return;
+ const scrollBottom = window.scrollY + window.innerHeight;
+ const docHeight = document.documentElement.scrollHeight;
+ if (docHeight - scrollBottom < 300) {
+ isLoadingMore = true;
+ infiniteScrollState.loadMore().finally(() => {
+ isLoadingMore = false;
+ });
}
}
- async function toggleDomainFeeds(host, rowEl) {
- const feedsDiv = rowEl.querySelector('.domain-feeds');
+ window.addEventListener('scroll', checkInfiniteScroll);
- // Close previously expanded domain
- if (expandedDomain && expandedDomain !== rowEl) {
- expandedDomain.querySelector('.domain-feeds').style.display = 'none';
+ // Render helpers
+ function renderDomainRow(d) {
+ let html = '
';
+ html += '
';
+ html += '
' + escapeHtml(d.host) + ' ';
+ html += '
↗ ';
+ html += '
' + commaFormat(d.feed_count) + ' feeds ';
+ html += '
revisit ';
+ html += '
';
+ if (d.status === 'error' && d.last_error) {
+ html += '
Error: ' + escapeHtml(d.last_error) + '
';
+ } else if (d.status === 'unchecked') {
+ html += '
Pending...
';
+ }
+ html += '
';
+ return html;
+ }
+
+ function renderFeedRow(f) {
+ let html = '
';
+ html += '
';
+ html += '
' + escapeHtml(f.url) + ' ';
+ html += '
↗ ';
+ html += '
';
+ if (f.title) html += '
' + escapeHtml(f.title) + '
';
+ if (f.source_host) {
+ html += '
from ' + escapeHtml(f.source_host) + ' ';
+ html += '
↗ ';
+ }
+ let statusParts = [f.type || 'unknown'];
+ if (f.status) {
+ const color = f.status === 'active' ? '#0a0' : (f.status === 'error' ? '#f66' : '#888');
+ statusParts.push('
' + escapeHtml(f.status) + ' ');
+ }
+ if (f.item_count > 0) statusParts.push(commaFormat(f.item_count) + ' items');
+ html += '
' + statusParts.join(' · ') + '
';
+ if (f.error_count > 0 && f.last_error) {
+ html += '
Error (' + f.error_count + '): ' + escapeHtml(f.last_error) + '
';
+ }
+ html += '
';
+ return html;
+ }
+
+ function renderTldRow(t) {
+ return '
' +
+ '.' + escapeHtml(t.tld) + ' ' +
+ '' + commaFormat(t.domain_count) + ' domains, ' + commaFormat(t.feed_count) + ' feeds
';
+ }
+
+ function attachDomainHandlers(container) {
+ container.querySelectorAll('.domain-row-cmd:not(.handled)').forEach(el => {
+ el.classList.add('handled');
+ el.querySelector('.domain-name').addEventListener('click', () => {
+ currentFilters.domain = el.dataset.host;
+ if (!currentFilters.tld && el.dataset.tld) currentFilters.tld = el.dataset.tld;
+ delete currentFilters.domainStatus;
+ executeFilters();
+ });
+ el.addEventListener('mouseenter', () => el.style.background = '#1a1a1a');
+ el.addEventListener('mouseleave', () => el.style.background = 'transparent');
+ const btn = el.querySelector('.revisit-btn');
+ if (btn) {
+ btn.addEventListener('click', async (e) => {
+ e.stopPropagation();
+ btn.disabled = true;
+ btn.textContent = '...';
+ try {
+ await fetch('/api/revisitDomain?host=' + encodeURIComponent(btn.dataset.host));
+ btn.textContent = 'queued';
+ btn.style.color = '#0a0';
+ } catch (err) {
+ btn.textContent = 'error';
+ btn.style.color = '#f66';
+ }
+ });
+ }
+ });
+ }
+
+ function attachFeedHandlers(container) {
+ container.querySelectorAll('.feed-row-cmd:not(.handled)').forEach(el => {
+ el.classList.add('handled');
+ el.querySelector('.feed-name').addEventListener('click', () => {
+ showFeedInfo(el.dataset.url);
+ });
+ el.addEventListener('mouseenter', () => el.style.background = '#1a1a1a');
+ el.addEventListener('mouseleave', () => el.style.background = 'transparent');
+ });
+ }
+
+ function attachTldHandlers(container) {
+ container.querySelectorAll('.tld-row:not(.handled)').forEach(el => {
+ el.classList.add('handled');
+ el.addEventListener('click', () => {
+ currentFilters = { tld: el.dataset.tld };
+ executeFilters();
+ });
+ el.addEventListener('mouseenter', () => el.style.background = '#1a1a1a');
+ el.addEventListener('mouseleave', () => el.style.background = 'transparent');
+ });
+ }
+
+ // Show help
+ function showHelp() {
+ currentFilters = {};
+ clearInfiniteScroll();
+ updateCommandInput();
+ renderBreadcrumb();
+ document.getElementById('output').innerHTML = '
Type a command and press Enter Examples: tld:.com tld:.com active tld:.com domains:error domain:example.com
';
+ }
+
+ // Show TLDs list
+ async function showTLDs() {
+ currentFilters = {};
+ clearInfiniteScroll();
+ updateCommandInput();
+ document.getElementById('commandInput').value = '/tlds';
+ renderBreadcrumb();
+
+ const output = document.getElementById('output');
+ output.innerHTML = '
Loading TLDs...
';
+
+ try {
+ const response = await fetch('/api/tlds');
+ const tlds = await response.json();
+
+ if (!tlds || tlds.length === 0) {
+ output.innerHTML = '
No TLDs found
';
+ return;
+ }
+
+ let html = '
';
+ tlds.forEach(t => html += renderTldRow(t));
+ html += '
';
+ output.innerHTML = html;
+ attachTldHandlers(output.querySelector('.tld-list'));
+ } catch (err) {
+ output.innerHTML = '
Error: ' + escapeHtml(err.message) + '
';
+ }
+ }
+
+ // Execute current filters
+ async function executeFilters() {
+ clearInfiniteScroll();
+ updateCommandInput();
+ renderBreadcrumb();
+
+ const output = document.getElementById('output');
+
+ // Determine what to show
+ const showFeeds = currentFilters.feedStatus || currentFilters.domain;
+ const showDomains = currentFilters.domainStatus || (!showFeeds && currentFilters.tld);
+
+ if (!currentFilters.tld && !currentFilters.domain && !currentFilters.feedStatus && !currentFilters.domainStatus) {
+ showHelp();
+ return;
}
- // Toggle current
- if (feedsDiv.style.display === 'none') {
- feedsDiv.style.display = 'block';
- feedsDiv.innerHTML = '
Loading feeds...
';
- expandedDomain = rowEl;
+ // Build API URL
+ const params = new URLSearchParams();
+ if (currentFilters.tld) params.set('tld', currentFilters.tld);
+ if (currentFilters.domain) params.set('domain', currentFilters.domain);
+ if (currentFilters.feedStatus) params.set('feedStatus', currentFilters.feedStatus);
+ if (currentFilters.domainStatus) params.set('domainStatus', currentFilters.domainStatus);
+ if (showFeeds) params.set('show', 'feeds');
+ else if (showDomains) params.set('show', 'domains');
+ output.innerHTML = '
Loading...
';
+
+ let offset = 0;
+ const limit = 100;
+
+ async function loadMore() {
try {
- const response = await fetch('/api/domainFeeds?host=' + encodeURIComponent(host));
- const feeds = await response.json();
+ params.set('limit', limit);
+ params.set('offset', offset);
+ const response = await fetch('/api/filter?' + params.toString());
+ const result = await response.json();
- if (!feeds || feeds.length === 0) {
- feedsDiv.innerHTML = '
No feeds found
';
+ if (!result.data || result.data.length === 0) {
+ infiniteScrollState.ended = true;
+ document.getElementById('infiniteLoader').textContent = offset === 0 ? 'No results found' : 'End of list';
+ return;
+ }
+
+ const container = output.querySelector('.result-list');
+
+ if (result.type === 'domains') {
+ result.data.forEach(d => container.insertAdjacentHTML('beforeend', renderDomainRow(d)));
+ attachDomainHandlers(container);
} else {
- feedsDiv.innerHTML = '';
- feeds.forEach(f => {
- const feedItem = document.createElement('div');
- feedItem.className = 'feed-item';
- feedItem.style.cssText = 'padding: 5px 10px; border-top: 1px solid #333; cursor: pointer;';
- feedItem.innerHTML =
- '' +
- '
';
+ result.data.forEach(f => container.insertAdjacentHTML('beforeend', renderFeedRow(f)));
+ attachFeedHandlers(container);
+ }
- feedItem.querySelector('.feed-header').addEventListener('click', (e) => {
- e.stopPropagation();
- toggleFeedInfo(f.url, feedItem);
- });
- feedsDiv.appendChild(feedItem);
- });
+ offset += result.data.length;
+
+ if (result.data.length < limit) {
+ infiniteScrollState.ended = true;
+ document.getElementById('infiniteLoader').textContent = 'End of list';
}
} catch (err) {
- feedsDiv.innerHTML = '
Error loading feeds
';
+ document.getElementById('infiniteLoader').textContent = 'Error loading';
}
- } else {
- feedsDiv.style.display = 'none';
- expandedDomain = null;
}
+
+ await loadMore();
+ setupInfiniteScroll(loadMore);
}
- async function toggleFeedInfo(feedUrl, feedItemEl) {
- const detailsDiv = feedItemEl.querySelector('.feed-details');
+ // Show feed info
+ async function showFeedInfo(feedUrl) {
+ clearInfiniteScroll();
+ renderBreadcrumb();
- // Close previously expanded feed
- if (expandedFeed && expandedFeed !== feedItemEl) {
- expandedFeed.querySelector('.feed-details').style.display = 'none';
- }
-
- // Toggle current
- if (detailsDiv.style.display === 'none') {
- detailsDiv.style.display = 'block';
- detailsDiv.innerHTML = '
Loading feed info...
';
- expandedFeed = feedItemEl;
-
- // Scroll the feed item to the top of the viewport
- feedItemEl.scrollIntoView({ behavior: 'smooth', block: 'start' });
-
- try {
- // Fetch feed info and items in parallel
- const [infoResponse, itemsResponse] = await Promise.all([
- fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
- fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=50')
- ]);
- const info = await infoResponse.json();
- const items = await itemsResponse.json();
-
- let html = '
';
-
- if (info.description) {
- html += '
' + escapeHtml(info.description) + '
';
- }
-
- html += '
';
-
- if (info.siteUrl) {
- html += 'Site ' + escapeHtml(info.siteUrl) + ' ';
- }
- if (info.language) {
- html += 'Language ' + escapeHtml(info.language) + ' ';
- }
- if (info.status) {
- html += 'Status ' + escapeHtml(info.status) + ' ';
- }
- if (info.itemCount) {
- html += 'Items ' + commaFormat(info.itemCount) + ' ';
- }
- if (info.avgPostFreqHrs) {
- html += 'Avg Post Freq ' + info.avgPostFreqHrs.toFixed(1) + ' hrs ';
- }
- if (info.ttlMinutes) {
- html += 'TTL ' + info.ttlMinutes + ' min ';
- }
- if (info.updatePeriod) {
- let updateStr = info.updatePeriod;
- if (info.updateFreq) updateStr += ' (' + info.updateFreq + ')';
- html += 'Update ' + escapeHtml(updateStr) + ' ';
- }
- if (info.lastBuildDate) {
- html += 'Last Build ' + escapeHtml(info.lastBuildDate) + ' ';
- }
- if (info.newestItemDate) {
- html += 'Newest Item ' + escapeHtml(info.newestItemDate) + ' ';
- }
- if (info.oldestItemDate) {
- html += 'Oldest Item ' + escapeHtml(info.oldestItemDate) + ' ';
- }
- if (info.discoveredAt) {
- html += 'Discovered ' + escapeHtml(info.discoveredAt) + ' ';
- }
- if (info.lastCrawledAt) {
- html += 'Last Crawled ' + escapeHtml(info.lastCrawledAt) + ' ';
- }
- if (info.errorCount > 0) {
- html += 'Errors ' + info.errorCount + ' ';
- }
- if (info.lastError) {
- html += 'Last Error ' + escapeHtml(info.lastError) + ' ';
- }
-
- html += '
';
-
- // Display items
- if (items && items.length > 0) {
- html += '
';
- html += '
Recent Items (' + items.length + ')
';
-
- items.forEach(item => {
- html += '
';
-
- // Title with link
- if (item.title) {
- if (item.link) {
- html += '
';
- } else {
- html += '
' + escapeHtml(item.title) + '
';
- }
- } else if (item.link) {
- html += '
';
- }
-
- // Metadata line (date, author)
- let meta = [];
- if (item.pub_date) {
- const date = new Date(item.pub_date);
- meta.push(date.toLocaleDateString() + ' ' + date.toLocaleTimeString());
- }
- if (item.author) {
- meta.push(escapeHtml(item.author));
- }
- if (meta.length > 0) {
- html += '
' + meta.join(' • ') + '
';
- }
-
- html += '
';
- });
-
- html += '
';
- }
-
- html += '
';
-
- detailsDiv.innerHTML = html;
- } catch (err) {
- detailsDiv.innerHTML = '
Error loading feed info
';
- }
- } else {
- detailsDiv.style.display = 'none';
- expandedFeed = null;
- }
- }
-
- // Infinite scroll handler with prefetch (uses window scroll)
- function setupInfiniteScroll() {
- window.addEventListener('scroll', () => {
- // Check if we're near the bottom of the page
- const scrollBottom = window.scrollY + window.innerHeight;
- const docHeight = document.documentElement.scrollHeight;
- const remainingPixels = docHeight - scrollBottom;
-
- // Prefetch when within 500px of the bottom
- if (remainingPixels < 500) {
- loadMoreDomains();
- }
- });
- }
-
- // Search functionality
- function setupSearch() {
- const searchInput = document.getElementById('searchInput');
- const searchResults = document.getElementById('searchResults');
- const domainsContainer = document.getElementById('allDomainsContainer');
-
- if (!searchInput || !searchResults || !domainsContainer) {
- console.error('Search elements not found');
- return;
- }
-
- searchInput.addEventListener('input', (e) => {
- const query = e.target.value.trim();
-
- // Clear previous timeout
- if (searchTimeout) {
- clearTimeout(searchTimeout);
- }
-
- // If empty, show domains list
- if (!query) {
- searchResults.style.display = 'none';
- domainsContainer.style.display = 'block';
- isSearching = false;
- return;
- }
-
- // Debounce search
- searchTimeout = setTimeout(() => performSearch(query), 300);
- });
-
- // Handle Enter key
- searchInput.addEventListener('keydown', (e) => {
- if (e.key === 'Enter') {
- const query = e.target.value.trim();
- if (query) {
- if (searchTimeout) clearTimeout(searchTimeout);
- performSearch(query);
- }
- }
- });
- }
-
- async function performSearch(query) {
- const searchResults = document.getElementById('searchResults');
- const domainsContainer = document.getElementById('allDomainsContainer');
-
- isSearching = true;
- domainsContainer.style.display = 'none';
- searchResults.style.display = 'block';
- searchResults.innerHTML = '
Searching...
';
-
- try {
- const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=200');
- const results = await response.json();
-
- if (!results || results.length === 0) {
- searchResults.innerHTML = '
No results found
';
- return;
- }
-
- // Group results by host
- const byHost = {};
- results.forEach(r => {
- const host = r.feed.source_host || 'unknown';
- if (!byHost[host]) {
- byHost[host] = [];
- }
- byHost[host].push(r);
- });
-
- // Render results
- searchResults.innerHTML = '';
-
- Object.keys(byHost).sort().forEach(host => {
- const hostDiv = document.createElement('div');
- hostDiv.className = 'search-host';
-
- // Host header
- const hostHeader = document.createElement('div');
- hostHeader.className = 'stat-row';
- hostHeader.style.cssText = 'cursor: pointer; background: #1a1a1a; padding: 8px; margin-bottom: 2px;';
- hostHeader.innerHTML = '
' + escapeHtml(host) + ' ' + byHost[host].length + ' feed(s) ';
-
- const feedsContainer = document.createElement('div');
- feedsContainer.style.display = 'block';
-
- byHost[host].forEach(result => {
- const feedDiv = document.createElement('div');
- feedDiv.className = 'search-feed';
- feedDiv.style.cssText = 'padding: 8px 8px 8px 20px; border-bottom: 1px solid #222;';
-
- // Feed header
- let feedHtml = '
' + escapeHtml(result.feed.url) + '
';
- if (result.feed.title) {
- feedHtml += '
' + escapeHtml(result.feed.title) + '
';
- }
- if (result.feed.description) {
- feedHtml += '
' + escapeHtml(result.feed.description.substring(0, 200)) + '
';
- }
-
- // Items
- if (result.items && result.items.length > 0) {
- feedHtml += '
';
- result.items.forEach(item => {
- feedHtml += '
';
- if (item.title) {
- if (item.link) {
- feedHtml += '
' + escapeHtml(item.title) + ' ';
- } else {
- feedHtml += '
' + escapeHtml(item.title) + ' ';
- }
- }
- let meta = [];
- if (item.pub_date) {
- meta.push(item.pub_date.substring(0, 10));
- }
- if (item.author) {
- meta.push(escapeHtml(item.author));
- }
- if (meta.length > 0) {
- feedHtml += '
' + meta.join(' • ') + '
';
- }
- feedHtml += '
';
- });
- feedHtml += '
';
- }
-
- feedDiv.innerHTML = feedHtml;
-
- // Click on feed URL to toggle full feed info
- feedDiv.querySelector('.feed-url').addEventListener('click', () => {
- toggleSearchFeedInfo(result.feed.url, feedDiv);
- });
-
- feedsContainer.appendChild(feedDiv);
- });
-
- hostHeader.addEventListener('click', () => {
- feedsContainer.style.display = feedsContainer.style.display === 'none' ? 'block' : 'none';
- });
-
- hostDiv.appendChild(hostHeader);
- hostDiv.appendChild(feedsContainer);
- searchResults.appendChild(hostDiv);
- });
-
- } catch (err) {
- console.error('Search failed:', err);
- searchResults.innerHTML = '
Search failed: ' + escapeHtml(err.message) + '
';
- }
- }
-
- async function toggleSearchFeedInfo(feedUrl, feedDiv) {
- let detailsDiv = feedDiv.querySelector('.feed-details-expanded');
-
- if (detailsDiv) {
- detailsDiv.remove();
- return;
- }
-
- detailsDiv = document.createElement('div');
- detailsDiv.className = 'feed-details-expanded';
- detailsDiv.style.cssText = 'padding: 10px; background: #111; margin-top: 8px; border-radius: 4px;';
- detailsDiv.innerHTML = '
Loading feed info...
';
- feedDiv.appendChild(detailsDiv);
+ const output = document.getElementById('output');
+ output.innerHTML = '
Loading feed info...
';
try {
const [infoResponse, itemsResponse] = await Promise.all([
fetch('/api/feedInfo?url=' + encodeURIComponent(feedUrl)),
- fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=20')
+ fetch('/api/feedItems?url=' + encodeURIComponent(feedUrl) + '&limit=100')
]);
const info = await infoResponse.json();
const items = await itemsResponse.json();
- let html = '
';
- if (info.siteUrl) html += 'Site ' + escapeHtml(info.siteUrl) + ' ';
- if (info.language) html += 'Language ' + escapeHtml(info.language) + ' ';
- if (info.status) html += 'Status ' + escapeHtml(info.status) + ' ';
- if (info.itemCount) html += 'Items ' + commaFormat(info.itemCount) + ' ';
- if (info.avgPostFreqHrs) html += 'Avg Freq ' + info.avgPostFreqHrs.toFixed(1) + ' hrs ';
- if (info.newestItemDate) html += 'Newest ' + escapeHtml(info.newestItemDate) + ' ';
+ let html = '';
+ html += '
' + escapeHtml(feedUrl) + '
';
+ if (info.title) html += '
' + escapeHtml(info.title) + '
';
+ if (info.description) html += '
' + escapeHtml(info.description) + '
';
+
+ html += '
';
+ const addRow = (label, value, color) => value ? '' + label + ' ' + escapeHtml(String(value)) + ' ' : '';
+ html += addRow('Type', info.type);
+ html += addRow('Status', info.status, info.status === 'active' ? '#0a0' : '#f66');
+ html += addRow('Language', info.language);
+ html += addRow('Site URL', info.siteUrl);
+ html += addRow('Items', info.itemCount ? commaFormat(info.itemCount) : null);
+ html += addRow('Avg Post Freq', info.avgPostFreqHrs ? info.avgPostFreqHrs.toFixed(1) + ' hrs' : null);
+ html += addRow('Discovered', info.discoveredAt);
+ html += addRow('Last Crawled', info.lastCrawledAt);
+ if (info.errorCount > 0) {
+ html += addRow('Errors', info.errorCount, '#f66');
+ html += addRow('Last Error', info.lastError, '#f66');
+ }
html += '
';
if (items && items.length > 0) {
- html += '
';
- html += '
All Items (' + items.length + ')
';
+ html += '
';
+ html += '
Recent Items (' + items.length + ')
';
items.forEach(item => {
- html += '
';
+ html += '
';
if (item.title && item.link) {
- html += '
' + escapeHtml(item.title) + ' ';
+ html += '
';
} else if (item.title) {
- html += '
' + escapeHtml(item.title) + ' ';
+ html += '
' + escapeHtml(item.title) + '
';
}
+ let meta = [];
+ if (item.pub_date) meta.push(new Date(item.pub_date).toLocaleDateString() + ' ' + new Date(item.pub_date).toLocaleTimeString());
+ if (item.author) meta.push(escapeHtml(item.author));
+ if (meta.length > 0) html += '
' + meta.join(' • ') + '
';
html += '
';
});
html += '
';
}
-
- detailsDiv.innerHTML = html;
+ html += '
';
+ output.innerHTML = html;
} catch (err) {
- detailsDiv.innerHTML = '
Failed to load feed info
';
+ output.innerHTML = '
Error: ' + escapeHtml(err.message) + '
';
}
}
+ // Highlight matching text
+ function highlightMatch(text, query) {
+ if (!text || !query) return text ? escapeHtml(text) : '';
+ const escaped = escapeHtml(text);
+ const regex = new RegExp('(' + query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + ')', 'gi');
+ return escaped.replace(regex, '
$1 ');
+ }
+
+ // Perform search
+ async function performSearch(query) {
+ currentFilters = {};
+ clearInfiniteScroll();
+ updateCommandInput();
+ document.getElementById('commandInput').value = query;
+ renderBreadcrumb();
+
+ const output = document.getElementById('output');
+ output.innerHTML = '
Searching...
';
+
+ try {
+ const response = await fetch('/api/search?q=' + encodeURIComponent(query) + '&limit=500');
+ const results = await response.json();
+
+ if (!results || results.length === 0) {
+ output.innerHTML = '
No results found
';
+ return;
+ }
+
+ let html = '
';
+ html += '
' + results.length + ' feed(s) found
';
+
+ results.forEach(r => {
+ const f = r.feed;
+ html += '
';
+ html += '
';
+ html += '
' + highlightMatch(f.url, query) + ' ';
+ html += '
↗ ';
+ html += '
';
+ if (f.title) html += '
' + highlightMatch(f.title, query) + '
';
+ if (f.source_host) {
+ html += '
from ' + highlightMatch(f.source_host, query) + '
';
+ }
+ let statusParts = [];
+ if (f.type) statusParts.push(f.type);
+ if (f.status) statusParts.push('
' + f.status + ' ');
+ if (f.item_count > 0) statusParts.push(commaFormat(f.item_count) + ' items');
+ if (statusParts.length > 0) html += '
' + statusParts.join(' · ') + '
';
+
+ if (r.items && r.items.length > 0) {
+ html += '
';
+ html += '
' + r.items.length + ' matching item(s)
';
+ r.items.forEach(item => {
+ html += '
';
+ if (item.title && item.link) html += '
' + highlightMatch(item.title, query) + ' ';
+ else if (item.title) html += '
' + highlightMatch(item.title, query) + ' ';
+ if (item.pub_date) html += '
' + item.pub_date.substring(0, 10) + ' ';
+ if (item.description) html += '
' + highlightMatch(item.description.substring(0, 300), query) + '
';
+ html += '
';
+ });
+ html += '
';
+ }
+ html += '
';
+ });
+ html += '
';
+ output.innerHTML = html;
+
+ output.querySelectorAll('.search-feed-url').forEach(el => {
+ el.addEventListener('click', () => showFeedInfo(el.dataset.url));
+ });
+ } catch (err) {
+ output.innerHTML = '
Search error: ' + escapeHtml(err.message) + '
';
+ }
+ }
+
+ // Show publish management view
+ async function showPublish() {
+ currentFilters = {};
+ clearInfiniteScroll();
+ updateCommandInput();
+ document.getElementById('commandInput').value = '/publish';
+ renderBreadcrumb();
+
+ const output = document.getElementById('output');
+ output.innerHTML = '
Loading publish data...
';
+
+ try {
+ const [candidatesRes, passedRes] = await Promise.all([
+ fetch('/api/publishCandidates?limit=50'),
+ fetch('/api/publishEnabled')
+ ]);
+ const candidates = await candidatesRes.json();
+ const passed = await passedRes.json();
+
+ let html = '
';
+
+ // Passed feeds (approved for publishing)
+ html += '
';
+ html += '
✓ Approved for Publishing (' + passed.length + ')
';
+ if (passed.length === 0) {
+ html += '
No feeds approved yet
';
+ } else {
+ passed.forEach(f => {
+ html += '
';
+ html += '
';
+ html += '
' + escapeHtml(f.title || f.url) + '
';
+ html += '
' + escapeHtml(f.url) + '
';
+ html += '
→ ' + escapeHtml(f.account) + ' (' + f.unpublished_count + ' unpublished)
';
+ html += '
';
+ html += '
Revoke ';
+ html += '
';
+ });
+ }
+ html += '
';
+
+ // Candidates (held for review)
+ html += '
';
+ html += '
⏳ Held for Review (' + candidates.length + ')
';
+ if (candidates.length === 0) {
+ html += '
No candidates held
';
+ } else {
+ candidates.forEach(f => {
+ html += '
';
+ html += '
';
+ html += '
';
+ html += '
' + escapeHtml(f.title || f.url) + '
';
+ html += '
' + escapeHtml(f.url) + '
';
+ html += '
→ ' + escapeHtml(f.derived_handle) + '
';
+ html += '
' + escapeHtml(f.source_host) + ' · ' + f.item_count + ' items · ' + escapeHtml(f.category) + '
';
+ html += '
';
+ html += '
Pass ';
+ html += '
Fail ';
+ html += '
';
+ html += '
';
+ });
+ }
+ html += '
';
+
+ html += '
';
+ output.innerHTML = html;
+
+ // Attach handlers for pass/fail buttons
+ output.querySelectorAll('.status-btn').forEach(btn => {
+ btn.addEventListener('click', async () => {
+ const url = btn.dataset.url;
+ const status = btn.dataset.status;
+ btn.disabled = true;
+ btn.textContent = '...';
+ try {
+ const response = await fetch('/api/setPublishStatus?url=' + encodeURIComponent(url) + '&status=' + status);
+ if (response.ok) {
+ // Refresh the view
+ showPublish();
+ } else {
+ btn.textContent = 'Error';
+ btn.style.background = '#600';
+ }
+ } catch (err) {
+ btn.textContent = 'Error';
+ btn.style.background = '#600';
+ }
+ });
+ });
+
+ } catch (err) {
+ output.innerHTML = '
Error: ' + escapeHtml(err.message) + '
';
+ }
+ }
+
+ // Process command
+ function processCommand(cmd) {
+ const trimmed = cmd.trim();
+ if (!trimmed || trimmed === '/help') {
+ showHelp();
+ return;
+ }
+ if (trimmed === '/tlds' || trimmed === '/tld') {
+ showTLDs();
+ return;
+ }
+ if (trimmed === '/publish') {
+ showPublish();
+ return;
+ }
+
+ // Check if it looks like a filter command
+ const hasFilter = trimmed.includes(':') || ['active', 'error', 'dead', 'unchecked', 'checked'].some(s => trimmed.toLowerCase().includes(s));
+
+ if (hasFilter) {
+ currentFilters = parseCommand(trimmed);
+ executeFilters();
+ } else {
+ // Treat as search
+ performSearch(trimmed);
+ }
+ }
+
+ // Setup command input
+ function setupCommandInput() {
+ const input = document.getElementById('commandInput');
+ input.addEventListener('keydown', (e) => { if (e.key === 'Enter') processCommand(input.value); });
+ input.addEventListener('focus', () => input.select());
+ document.querySelectorAll('.cmd-btn').forEach(btn => {
+ btn.addEventListener('click', () => {
+ const cmd = btn.dataset.cmd;
+ // Special commands that reset filters
+ if (cmd === '/tlds' || cmd === '/publish') {
+ currentFilters = {};
+ input.value = cmd;
+ processCommand(cmd);
+ return;
+ }
+ // Status buttons add to current filters
+ const btnFilters = parseCommand(cmd);
+ if (btnFilters.domainStatus) {
+ currentFilters.domainStatus = btnFilters.domainStatus;
+ delete currentFilters.feedStatus; // Can't have both
+ delete currentFilters.domain; // Show domains, not feeds for a domain
+ }
+ if (btnFilters.feedStatus) {
+ currentFilters.feedStatus = btnFilters.feedStatus;
+ delete currentFilters.domainStatus; // Can't have both
+ }
+ updateCommandInput();
+ executeFilters();
+ });
+ });
+ }
+
+ // Stats update
async function updateStats() {
try {
const response = await fetch('/api/stats');
const stats = await response.json();
-
- // Update domain stats
document.getElementById('totalDomains').textContent = commaFormat(stats.total_domains);
document.getElementById('checkedDomains').textContent = commaFormat(stats.checked_domains);
document.getElementById('uncheckedDomains').textContent = commaFormat(stats.unchecked_domains);
document.getElementById('crawlRate').textContent = commaFormat(stats.crawl_rate);
document.getElementById('checkRate').textContent = commaFormat(stats.check_rate);
-
- // Update progress bar
- const progress = stats.total_domains > 0
- ? (stats.checked_domains * 100 / stats.total_domains).toFixed(1)
- : 0;
+ const progress = stats.total_domains > 0 ? (stats.checked_domains * 100 / stats.total_domains).toFixed(1) : 0;
document.getElementById('crawlProgress').style.width = progress + '%';
-
- // Update feed stats
document.getElementById('totalFeeds').textContent = commaFormat(stats.total_feeds);
document.getElementById('rssFeeds').textContent = commaFormat(stats.rss_feeds);
document.getElementById('atomFeeds').textContent = commaFormat(stats.atom_feeds);
document.getElementById('unknownFeeds').textContent = commaFormat(stats.unknown_feeds);
-
- // Update timestamp
const updatedAt = new Date(stats.updated_at);
- document.getElementById('updatedAt').textContent = 'Last updated: ' +
- updatedAt.toISOString().replace('T', ' ').substring(0, 19);
-
+ document.getElementById('updatedAt').textContent = 'Last updated: ' + updatedAt.toISOString().replace('T', ' ').substring(0, 19);
} catch (err) {
console.error('Failed to update stats:', err);
}
}
// Initialize
- try {
- setupSearch();
- } catch (e) {
- console.error('setupSearch failed:', e);
- }
- setupInfiniteScroll();
- loadMoreDomains();
+ setupCommandInput();
+ showHelp();
updateStats();
setInterval(updateStats, 1000);
}
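For reference, the response contract that executeFilters() assumes from /api/filter, sketched as Go structs. The JSON keys mirror what renderDomainRow/renderFeedRow read; the Go type names and field types are illustrative assumptions, not taken from the server code:

// filterResponse is what the dashboard expects back from GET /api/filter.
type filterResponse struct {
	Type string          `json:"type"` // "domains" or "feeds" — selects which renderer the JS uses
	Data json.RawMessage `json:"data"` // array of domainRow or feedRow, depending on Type
}

type domainRow struct {
	Host      string `json:"host"`
	TLD       string `json:"tld,omitempty"`
	Status    string `json:"status"` // e.g. "checked", "unchecked", "error"
	FeedCount int64  `json:"feed_count"`
	LastError string `json:"last_error,omitempty"`
}

type feedRow struct {
	URL        string `json:"url"`
	Title      string `json:"title,omitempty"`
	SourceHost string `json:"source_host,omitempty"`
	Type       string `json:"type,omitempty"`
	Status     string `json:"status,omitempty"`
	ItemCount  int64  `json:"item_count"`
	ErrorCount int64  `json:"error_count"`
	LastError  string `json:"last_error,omitempty"`
}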