Dashboard UI overhaul: inline feed details, TLD filtering, status improvements

- Feed details now expand inline instead of navigating to new page
- Add TLD section headers with domains sorted by TLD then name
- Add TLD filter button to show/hide domain sections by TLD
- Feed status behavior: pass creates account, hold crawls only, skip stops, drop cleans up
- Auto-follow new accounts from directory account (1440.news)
- Fix handle derivation (removed duplicate .1440.news suffix)
- Increase domain import batch size to 100k
- Various bug fixes for account creation and profile updates

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
primal
2026-01-29 20:51:05 -05:00
parent 5908a8c03e
commit 3999e96f26
11 changed files with 2603 additions and 1178 deletions
+23 -12
@@ -45,10 +45,11 @@ Multi-file Go application that crawls websites for RSS/Atom feeds, stores them i
### Concurrent Loops (main.go)
The application runs six independent goroutine loops:
- **Import loop** - Reads `vertices.txt.gz` and inserts domains into DB in 10k batches
- **Crawl loop** - Worker pool processes unchecked domains, discovers feeds
- **Check loop** - Worker pool re-checks known feeds for updates (conditional HTTP)
The application runs seven independent goroutine loops:
- **Import loop** - Reads `vertices.txt.gz` and inserts domains into DB in 10k batches (status='hold')
- **Domain check loop** - HEAD requests to verify approved domains are reachable
- **Crawl loop** - Worker pool crawls verified domains for feed discovery
- **Feed check loop** - Worker pool re-checks known feeds for updates (conditional HTTP)
- **Stats loop** - Updates cached dashboard statistics every minute
- **Cleanup loop** - Removes items older than 12 months (weekly)
- **Publish loop** - Autopublishes items from approved feeds to AT Protocol PDS
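
The domain check and crawl loops (and, per the description above, the feed check loop) follow the same worker-pool shape: a buffered work channel, one goroutine per CPU, and a periodic batch fetch from the database. A minimal, self-contained sketch of that pattern — `fetchBatch` and `process` are placeholders standing in for the real DB-backed methods such as `GetDomainsToCheck`:

```go
package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	numWorkers := runtime.NumCPU()
	if numWorkers < 1 {
		numWorkers = 1
	}
	workChan := make(chan string, 256) // buffered channel of hosts to process

	// One worker goroutine per CPU drains the channel.
	for i := 0; i < numWorkers; i++ {
		go func() {
			for host := range workChan {
				process(host)
			}
		}()
	}

	// Loop body: fetch a batch, enqueue it, sleep briefly, repeat.
	for {
		batch := fetchBatch(1000)
		if len(batch) == 0 {
			time.Sleep(1 * time.Second)
			continue
		}
		for _, host := range batch {
			workChan <- host
		}
		time.Sleep(1 * time.Second)
	}
}

// Placeholders standing in for the repo's DB-backed methods.
func fetchBatch(limit int) []string { return nil }
func process(host string)           { fmt.Println("processing", host) }
```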
@@ -70,8 +71,8 @@ The application runs six independent goroutine loops:
### Database Schema
PostgreSQL with pgx driver, using connection pooling:
- **domains** - Hosts to crawl (status: unchecked/checked/error)
- **feeds** - Discovered RSS/Atom feeds with metadata and cache headers
- **domains** - Hosts to crawl (status: hold/pass/skip/fail)
- **feeds** - Discovered RSS/Atom feeds with metadata and cache headers (publish_status: hold/pass/skip)
- **items** - Individual feed entries (guid + feed_url unique)
- **search_vector** - GENERATED tsvector columns for full-text search (GIN indexed)
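
A hedged sketch of how the GIN-indexed `search_vector` column gets queried, modeled on the `SearchItems` query further down in this diff; the pgx v5 pool, the column list, and the function name are illustrative:

```go
package example

import (
	"context"

	"github.com/jackc/pgx/v5/pgxpool"
)

// searchItemTitles runs a full-text query against the GENERATED tsvector column.
// The caller is expected to turn raw user input into tsquery syntax first
// (the repo does this with a ToSearchQuery helper).
func searchItemTitles(ctx context.Context, pool *pgxpool.Pool, tsquery string) ([]string, error) {
	rows, err := pool.Query(ctx, `
		SELECT title
		FROM items
		WHERE search_vector @@ to_tsquery('english', $1)
		LIMIT 50`, tsquery)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var titles []string
	for rows.Next() {
		var title *string
		if err := rows.Scan(&title); err != nil {
			return nil, err
		}
		if title != nil {
			titles = append(titles, *title)
		}
	}
	return titles, rows.Err()
}
```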
@@ -79,11 +80,12 @@ Column naming: snake_case (e.g., `source_host`, `pub_date`, `item_count`)
### Crawl Logic
1. Domain picked from `unchecked` status (random order)
2. Try HTTPS, fall back to HTTP
3. Recursive crawl up to MaxDepth=10, MaxPagesPerHost=10
4. Extract `<link rel="alternate">` and anchor hrefs containing rss/atom/feed
5. Parse discovered feeds for metadata, save with next_crawl_at
1. Domain manually approved (status set to 'pass')
2. Check stage: HEAD request verifies domain is reachable, sets last_checked_at
3. Crawl stage: Full recursive crawl (HTTPS, fallback HTTP)
4. Recursive crawl up to MaxDepth=10, MaxPagesPerHost=10
5. Extract `<link rel="alternate">` and anchor hrefs containing rss/atom/feed
6. Parse discovered feeds for metadata, save with next_crawl_at
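
The stage transitions are driven entirely by `status` plus the two timestamps; a sketch of the selection queries, mirroring `GetDomainsToCheck` and `GetDomainsToCrawl` further down in this diff:

```go
package example

// Stage 1 (check): approved domains that have never had a HEAD verification.
const domainsToCheckSQL = `
	SELECT host FROM domains
	WHERE status = 'pass' AND last_checked_at IS NULL
	ORDER BY discovered_at ASC
	LIMIT $1`

// Stage 2 (crawl): approved domains that passed the check but haven't been crawled yet.
const domainsToCrawlSQL = `
	SELECT host FROM domains
	WHERE status = 'pass' AND last_checked_at IS NOT NULL AND last_crawled_at IS NULL
	ORDER BY discovered_at ASC
	LIMIT $1`
```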
### Feed Checking
@@ -92,7 +94,16 @@ Uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 1
### Publishing
Feeds with `publish_status = 'pass'` have their items automatically posted to AT Protocol.
Status values: `held` (default), `pass` (approved), `deny` (rejected).
Status values: `hold` (default/pending review), `pass` (approved), `skip` (rejected).
### Domain Processing (Two-Stage)
1. **Check stage** - HEAD request to verify domain is reachable
2. **Crawl stage** - Full recursive crawl for feed discovery
Domain status values: `hold` (pending), `pass` (approved), `skip` (rejected), `fail` (error).
Domains starting with a digit (except 1440.news) are auto-skipped.
Non-English feeds are auto-skipped.
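
A minimal sketch of those two auto-skip rules, following `shouldAutoSkipDomain` and the publish-status defaulting in `saveFeed` shown later in this commit; the helper names here are illustrative:

```go
package example

import "strings"

// autoSkipDomain: digit-prefixed hosts are treated as spam, with 1440.news exempt.
func autoSkipDomain(host string) bool {
	if host == "1440.news" || strings.HasSuffix(host, "1440.news") {
		return false
	}
	return len(host) > 0 && host[0] >= '0' && host[0] <= '9'
}

// autoSkipFeedLanguage: feeds with a missing or non-English language default to skip.
func autoSkipFeedLanguage(lang string) bool {
	return lang == "" || (lang != "en" && !strings.HasPrefix(lang, "en-"))
}
```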
## AT Protocol Integration
+128 -33
@@ -213,14 +213,14 @@ func (c *Crawler) StartPublishLoop() {
if displayName == "" {
displayName = account
}
// Build description with feed URL
description := feedInfo.Description
// Build description with feed URL (strip HTML tags)
description := stripHTML(feedInfo.Description)
if description == "" {
description = "News feed via 1440.news"
}
// Add feed URL to description
// Add feed URL as first line of description
feedURLFull := "https://" + item.FeedURL
description = description + "\n\n" + feedURLFull
description = feedURLFull + "\n\n" + description
// Truncate if needed
if len(displayName) > 64 {
displayName = displayName[:61] + "..."
@@ -230,8 +230,13 @@ func (c *Crawler) StartPublishLoop() {
}
// Fetch and upload favicon as avatar
var avatar *BlobRef
if feedInfo.SiteURL != "" {
faviconURL := publisher.FetchFavicon(feedInfo.SiteURL)
faviconSource := feedInfo.SiteURL
if faviconSource == "" {
// Fallback to deriving from feed URL
faviconSource = feedInfo.SourceHost
}
if faviconSource != "" {
faviconURL := publisher.FetchFavicon(faviconSource)
if faviconURL != "" {
avatar = publisher.fetchAndUploadImage(session, faviconURL)
}
@@ -241,6 +246,13 @@ func (c *Crawler) StartPublishLoop() {
} else {
fmt.Printf("Publish: set profile for %s\n", account)
}
// Have directory account follow this new account
if err := publisher.FollowAsDirectory(session.DID); err != nil {
fmt.Printf("Publish: directory follow failed for %s: %v\n", account, err)
} else {
fmt.Printf("Publish: directory now following %s\n", account)
}
}
}
sessions[account] = session
@@ -256,15 +268,6 @@ func (c *Crawler) StartPublishLoop() {
fmt.Printf("Publish: short URL failed for %s: %v\n", item.Link[:min(40, len(item.Link))], err)
}
}
if item.Enclosure != nil && item.Enclosure.URL != "" {
if shortURL, err := c.GetShortURLForPost(item.Enclosure.URL, &item.ID, item.FeedURL); err == nil {
itemToPublish.Enclosure = &Enclosure{
URL: shortURL,
Type: item.Enclosure.Type,
Length: item.Enclosure.Length,
}
}
}
// Publish the item
uri, err := publisher.PublishItem(session, &itemToPublish)
@@ -305,14 +308,15 @@ type FeedInfo struct {
Title string
Description string
SiteURL string
SourceHost string
}
// getFeedInfo returns feed metadata for profile setup
func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
var title, description, siteURL *string
var title, description, siteURL, sourceHost *string
err := c.db.QueryRow(`
SELECT title, description, site_url FROM feeds WHERE url = $1
`, feedURL).Scan(&title, &description, &siteURL)
SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
`, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
if err != nil {
return nil
}
@@ -320,13 +324,14 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
Title: StringValue(title),
Description: StringValue(description),
SiteURL: StringValue(siteURL),
SourceHost: StringValue(sourceHost),
}
}
// RefreshAllProfiles updates profiles for all existing accounts with feed URLs
func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
rows, err := c.db.Query(`
SELECT url, title, description, site_url, publish_account
SELECT url, title, description, site_url, source_host, publish_account
FROM feeds
WHERE publish_account IS NOT NULL AND publish_account <> ''
`)
@@ -338,8 +343,8 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
for rows.Next() {
var feedURL, account string
var title, description, siteURL *string
if err := rows.Scan(&feedURL, &title, &description, &siteURL, &account); err != nil {
var title, description, siteURL, sourceHost *string
if err := rows.Scan(&feedURL, &title, &description, &siteURL, &sourceHost, &account); err != nil {
continue
}
@@ -355,13 +360,13 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
if displayName == "" {
displayName = account
}
desc := StringValue(description)
desc := stripHTML(StringValue(description))
if desc == "" {
desc = "News feed via 1440.news"
}
// Add feed URL
// Add feed URL as first line
feedURLFull := "https://" + feedURL
desc = desc + "\n\n" + feedURLFull
desc = feedURLFull + "\n\n" + desc
// Truncate if needed
if len(displayName) > 64 {
@@ -373,8 +378,13 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
// Fetch and upload favicon as avatar
var avatar *BlobRef
if siteURL != nil && *siteURL != "" {
faviconURL := publisher.FetchFavicon(*siteURL)
faviconSource := StringValue(siteURL)
if faviconSource == "" {
// Fallback to source host
faviconSource = StringValue(sourceHost)
}
if faviconSource != "" {
faviconURL := publisher.FetchFavicon(faviconSource)
if faviconURL != "" {
avatar = publisher.fetchAndUploadImage(session, faviconURL)
}
@@ -392,7 +402,7 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
rows, err := c.db.Query(`
SELECT i.id, i.feed_url, i.guid, i.title, i.link, i.description, i.content,
i.author, i.pub_date, i.discovered_at, i.image_urls,
i.author, i.pub_date, i.discovered_at, i.image_urls, i.tags,
i.enclosure_url, i.enclosure_type, i.enclosure_length
FROM items i
JOIN feeds f ON i.feed_url = f.url
@@ -410,13 +420,13 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
var items []Item
for rows.Next() {
var item Item
var guid, title, link, description, content, author, imageURLsJSON *string
var guid, title, link, description, content, author, imageURLsJSON, tagsJSON *string
var pubDate, discoveredAt *time.Time
var enclosureURL, enclosureType *string
var enclosureLength *int64
err := rows.Scan(&item.ID, &item.FeedURL, &guid, &title, &link, &description,
&content, &author, &pubDate, &discoveredAt, &imageURLsJSON,
&content, &author, &pubDate, &discoveredAt, &imageURLsJSON, &tagsJSON,
&enclosureURL, &enclosureType, &enclosureLength)
if err != nil {
continue
@@ -436,6 +446,11 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
json.Unmarshal([]byte(*imageURLsJSON), &item.ImageURLs)
}
// Parse tags from JSON array
if tagsJSON != nil && *tagsJSON != "" {
json.Unmarshal([]byte(*tagsJSON), &item.Tags)
}
// Parse enclosure
if enclosureURL != nil && *enclosureURL != "" {
item.Enclosure = &Enclosure{
@@ -453,7 +468,87 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
return items, nil
}
// StartCrawlLoop runs the domain crawling loop independently
// StartDomainCheckLoop runs HEAD requests on approved domains to verify they're reachable
func (c *Crawler) StartDomainCheckLoop() {
numWorkers := runtime.NumCPU()
if numWorkers < 1 {
numWorkers = 1
}
// Buffered channel for domain work
workChan := make(chan *Domain, 256)
// Start workers
for i := 0; i < numWorkers; i++ {
go func() {
for domain := range workChan {
// Do HEAD request to verify domain is reachable
checkErr := c.checkDomain(domain.Host)
errStr := ""
if checkErr != nil {
errStr = checkErr.Error()
}
if err := c.markDomainChecked(domain.Host, errStr); err != nil {
fmt.Printf("Error marking domain %s as checked: %v\n", domain.Host, err)
}
}
}()
}
const fetchSize = 1000
for {
domains, err := c.GetDomainsToCheck(fetchSize)
if err != nil {
fmt.Printf("Error fetching domains to check: %v\n", err)
}
if len(domains) == 0 {
time.Sleep(1 * time.Second)
continue
}
fmt.Printf("%s domain-check: %d domains to verify\n", time.Now().Format("15:04:05"), len(domains))
for _, domain := range domains {
workChan <- domain
}
time.Sleep(1 * time.Second)
}
}
// checkDomain performs a HEAD request to verify a domain is reachable
func (c *Crawler) checkDomain(host string) error {
url := "https://" + host
req, err := http.NewRequest("HEAD", url, nil)
if err != nil {
return err
}
req.Header.Set("User-Agent", c.UserAgent)
resp, err := c.client.Do(req)
if err != nil {
// Try HTTP fallback
url = "http://" + host
req, err = http.NewRequest("HEAD", url, nil)
if err != nil {
return err
}
req.Header.Set("User-Agent", c.UserAgent)
resp, err = c.client.Do(req)
if err != nil {
return err
}
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
return fmt.Errorf("HTTP %d", resp.StatusCode)
}
return nil
}
// StartCrawlLoop runs the domain crawling loop independently (crawls checked domains)
func (c *Crawler) StartCrawlLoop() {
numWorkers := runtime.NumCPU()
if numWorkers < 1 {
@@ -481,9 +576,9 @@ func (c *Crawler) StartCrawlLoop() {
const fetchSize = 1000
for {
domains, err := c.GetUncheckedDomains(fetchSize)
domains, err := c.GetDomainsToCrawl(fetchSize)
if err != nil {
fmt.Printf("Error fetching domains: %v\n", err)
fmt.Printf("Error fetching domains to crawl: %v\n", err)
}
if len(domains) == 0 {
@@ -492,7 +587,7 @@ func (c *Crawler) StartCrawlLoop() {
continue
}
fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains))
fmt.Printf("%s crawl: %d domains to crawl\n", time.Now().Format("15:04:05"), len(domains))
for _, domain := range domains {
workChan <- domain
+1040 -146
File diff suppressed because it is too large
+4 -2
@@ -15,8 +15,9 @@ import (
const schema = `
CREATE TABLE IF NOT EXISTS domains (
host TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'unchecked',
status TEXT NOT NULL DEFAULT 'hold',
discovered_at TIMESTAMPTZ NOT NULL,
last_checked_at TIMESTAMPTZ,
last_crawled_at TIMESTAMPTZ,
feeds_found INTEGER DEFAULT 0,
last_error TEXT,
@@ -65,7 +66,7 @@ CREATE TABLE IF NOT EXISTS feeds (
no_update INTEGER DEFAULT 0,
-- Publishing to PDS
publish_status TEXT DEFAULT 'held' CHECK(publish_status IN ('held', 'pass', 'deny')),
publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')),
publish_account TEXT,
-- Full-text search vector
@@ -106,6 +107,7 @@ CREATE TABLE IF NOT EXISTS items (
enclosure_type TEXT,
enclosure_length BIGINT,
image_urls TEXT, -- JSON array of image URLs
tags TEXT, -- JSON array of category/tag strings
-- Publishing to PDS
published_at TIMESTAMPTZ,
+11 -1
@@ -25,14 +25,24 @@ services:
- "traefik.http.routers.app-1440-news.rule=Host(`app.1440.news`)"
- "traefik.http.routers.app-1440-news.entrypoints=https"
- "traefik.http.routers.app-1440-news.tls.certresolver=letsencrypt-dns"
# Production: HTTPS for 1440.news root (accounts directory) - lower priority than PDS API paths
- "traefik.http.routers.root-1440-news.rule=Host(`1440.news`)"
- "traefik.http.routers.root-1440-news.entrypoints=https"
- "traefik.http.routers.root-1440-news.tls.certresolver=letsencrypt-dns"
- "traefik.http.routers.root-1440-news.priority=10"
# Production: HTTPS for url.1440.news (URL shortener)
- "traefik.http.routers.url-1440-news.rule=Host(`url.1440.news`)"
- "traefik.http.routers.url-1440-news.entrypoints=https"
- "traefik.http.routers.url-1440-news.tls.certresolver=letsencrypt-dns"
# Production: HTTP to HTTPS redirect for both domains
# Production: HTTP to HTTPS redirect for app and url subdomains
- "traefik.http.routers.app-1440-news-redirect.rule=Host(`app.1440.news`) || Host(`url.1440.news`)"
- "traefik.http.routers.app-1440-news-redirect.entrypoints=http"
- "traefik.http.routers.app-1440-news-redirect.middlewares=https-redirect"
# Production: HTTP to HTTPS redirect for 1440.news root
- "traefik.http.routers.root-1440-news-redirect.rule=Host(`1440.news`)"
- "traefik.http.routers.root-1440-news-redirect.entrypoints=http"
- "traefik.http.routers.root-1440-news-redirect.middlewares=https-redirect"
- "traefik.http.routers.root-1440-news-redirect.priority=10"
- "traefik.http.middlewares.https-redirect.redirectscheme.scheme=https"
- "traefik.http.middlewares.https-redirect.redirectscheme.permanent=true"
# Local development: HTTP only
+100 -63
@@ -15,65 +15,72 @@ import (
)
// Domain represents a host to be crawled for feeds
// Status: hold (pending review), pass (approved), skip (not processing), fail (error)
type Domain struct {
Host string `json:"host"`
Status string `json:"status"`
DiscoveredAt time.Time `json:"discovered_at"`
LastCheckedAt time.Time `json:"last_checked_at,omitempty"`
LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
FeedsFound int `json:"feeds_found,omitempty"`
LastError string `json:"last_error,omitempty"`
TLD string `json:"tld,omitempty"`
}
// shouldAutoDenyDomain checks if a domain should be auto-denied based on patterns
func shouldAutoDenyDomain(host string) bool {
// Never deny our own domain
// shouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns
func shouldAutoSkipDomain(host string) bool {
// Never skip our own domain
if strings.HasSuffix(host, "1440.news") || host == "1440.news" {
return false
}
// Deny domains starting with a digit (spam pattern)
// Skip domains starting with a digit (spam pattern)
if len(host) > 0 && host[0] >= '0' && host[0] <= '9' {
return true
}
// Skip domains starting with letter-dash (spam pattern, e.g., "a-example.com")
if len(host) > 1 && ((host[0] >= 'a' && host[0] <= 'z') || (host[0] >= 'A' && host[0] <= 'Z')) && host[1] == '-' {
return true
}
return false
}
// saveDomain stores a domain in PostgreSQL
func (c *Crawler) saveDomain(domain *Domain) error {
// Auto-deny domains matching spam patterns
// Auto-skip domains matching spam patterns
status := domain.Status
if shouldAutoDenyDomain(domain.Host) {
status = "denied"
if shouldAutoSkipDomain(domain.Host) {
status = "skip"
}
_, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7)
INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT(host) DO UPDATE SET
status = EXCLUDED.status,
last_checked_at = EXCLUDED.last_checked_at,
last_crawled_at = EXCLUDED.last_crawled_at,
feeds_found = EXCLUDED.feeds_found,
last_error = EXCLUDED.last_error,
tld = EXCLUDED.tld
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt),
NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
return err
}
// saveDomainTx stores a domain using a transaction
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
// Auto-deny domains matching spam patterns
// Auto-skip domains matching spam patterns
status := domain.Status
if shouldAutoDenyDomain(domain.Host) {
status = "denied"
if shouldAutoSkipDomain(domain.Host) {
status = "skip"
}
_, err := tx.Exec(context.Background(), `
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7)
INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT(host) DO NOTHING
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt),
NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
return err
}
@@ -87,14 +94,14 @@ func (c *Crawler) domainExists(host string) bool {
// getDomain retrieves a domain from PostgreSQL
func (c *Crawler) getDomain(host string) (*Domain, error) {
domain := &Domain{}
var lastCrawledAt *time.Time
var lastCheckedAt, lastCrawledAt *time.Time
var lastError *string
err := c.db.QueryRow(`
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE host = $1
`, normalizeHost(host)).Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
)
@@ -105,17 +112,34 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
return nil, err
}
domain.LastCheckedAt = TimeValue(lastCheckedAt)
domain.LastCrawledAt = TimeValue(lastCrawledAt)
domain.LastError = StringValue(lastError)
return domain, nil
}
// GetUncheckedDomains returns up to limit unchecked domains ordered by discovered_at (FIFO)
func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
// GetDomainsToCheck returns domains ready for checking (status='pass', never checked)
func (c *Crawler) GetDomainsToCheck(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE status = 'unchecked'
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE status = 'pass' AND last_checked_at IS NULL
ORDER BY discovered_at ASC
LIMIT $1
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return c.scanDomains(rows)
}
// GetDomainsToCrawl returns domains ready for crawling (status='pass', checked but not crawled)
func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE status = 'pass' AND last_checked_at IS NOT NULL AND last_crawled_at IS NULL
ORDER BY discovered_at ASC
LIMIT $1
`, limit)
@@ -132,16 +156,17 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
var domains []*Domain
for rows.Next() {
domain := &Domain{}
var lastCrawledAt *time.Time
var lastCheckedAt, lastCrawledAt *time.Time
var lastError *string
if err := rows.Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
); err != nil {
continue
}
domain.LastCheckedAt = TimeValue(lastCheckedAt)
domain.LastCrawledAt = TimeValue(lastCrawledAt)
domain.LastError = StringValue(lastError)
@@ -151,36 +176,48 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
return domains, rows.Err()
}
// markDomainCrawled updates a domain's status after crawling
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
status := "checked"
// markDomainChecked updates a domain after the check (HEAD request) stage
func (c *Crawler) markDomainChecked(host string, lastError string) error {
now := time.Now()
if lastError != "" {
status = "error"
}
var err error
if lastError != "" {
_, err = c.db.Exec(`
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = $4
WHERE host = $5
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
} else {
_, err = c.db.Exec(`
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = NULL
WHERE host = $4
`, status, time.Now(), feedsFound, normalizeHost(host))
_, err := c.db.Exec(`
UPDATE domains SET status = 'fail', last_checked_at = $1, last_error = $2
WHERE host = $3
`, now, lastError, normalizeHost(host))
return err
}
_, err := c.db.Exec(`
UPDATE domains SET last_checked_at = $1, last_error = NULL
WHERE host = $2
`, now, normalizeHost(host))
return err
}
// GetDomainCount returns the total number of domains in the database
func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
// markDomainCrawled updates a domain after the crawl stage
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
now := time.Now()
if lastError != "" {
_, err := c.db.Exec(`
UPDATE domains SET status = 'fail', last_crawled_at = $1, feeds_found = $2, last_error = $3
WHERE host = $4
`, now, feedsFound, lastError, normalizeHost(host))
return err
}
_, err := c.db.Exec(`
UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = NULL
WHERE host = $3
`, now, feedsFound, normalizeHost(host))
return err
}
// GetDomainCount returns the total number of domains and the number still on hold
func (c *Crawler) GetDomainCount() (total int, hold int, err error) {
err = c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&total)
if err != nil {
return 0, 0, err
}
err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'unchecked'").Scan(&unchecked)
return total, unchecked, err
err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'hold'").Scan(&hold)
return total, hold, err
}
// ImportTestDomains adds a list of specific domains for testing
@@ -189,7 +226,7 @@ func (c *Crawler) ImportTestDomains(domains []string) {
for _, host := range domains {
_, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld)
VALUES ($1, 'unchecked', $2, $3)
VALUES ($1, 'hold', $2, $3)
ON CONFLICT(host) DO NOTHING
`, host, now, getTLD(host))
if err != nil {
@@ -200,7 +237,7 @@ func (c *Crawler) ImportTestDomains(domains []string) {
}
}
// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
// ImportDomainsFromFile reads a vertices file and stores new domains as "hold"
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
file, err := os.Open(filename)
if err != nil {
@@ -246,7 +283,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
buf := make([]byte, 0, 64*1024)
scanner.Buffer(buf, 1024*1024)
const batchSize = 1000
const batchSize = 100000
now := time.Now()
totalImported := 0
batchCount := 0
@@ -284,12 +321,12 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
break
}
// Build rows for copy, applying auto-deny for spam patterns
// Build rows for copy, applying auto-skip for spam patterns
rows := make([][]interface{}, len(domains))
for i, d := range domains {
status := "unchecked"
if shouldAutoDenyDomain(d.host) {
status = "denied"
status := "hold"
if shouldAutoSkipDomain(d.host) {
status = "skip"
}
rows[i] = []interface{}{d.host, status, now, d.tld}
}
@@ -306,9 +343,9 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
if err != nil {
// Fall back to individual inserts with ON CONFLICT
for _, d := range domains {
status := "unchecked"
if shouldAutoDenyDomain(d.host) {
status = "denied"
status := "hold"
if shouldAutoSkipDomain(d.host) {
status = "skip"
}
c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld)
@@ -361,7 +398,7 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
now := time.Now()
count := 0
const batchSize = 1000
const batchSize = 100000
type domainEntry struct {
host string
@@ -391,11 +428,11 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
break
}
// Insert with ON CONFLICT, applying auto-deny for spam patterns
// Insert with ON CONFLICT, applying auto-skip for spam patterns
for _, d := range domains {
status := "unchecked"
if shouldAutoDenyDomain(d.host) {
status = "denied"
status := "hold"
if shouldAutoSkipDomain(d.host) {
status = "skip"
}
result, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld)
+69 -25
@@ -119,6 +119,7 @@ type Item struct {
// Media attachments
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
Tags []string `json:"tags,omitempty"` // Category/tag strings from feed
// Publishing to PDS
PublishedAt time.Time `json:"published_at,omitempty"`
@@ -171,25 +172,25 @@ type Feed struct {
NoUpdate int `json:"no_update"` // Consecutive checks with no change
// Publishing to PDS
PublishStatus string `json:"publish_status"` // "held", "pass", "deny"
PublishStatus string `json:"publish_status"` // "hold", "pass", "skip"
PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}
// saveFeed stores a feed in PostgreSQL
func (c *Crawler) saveFeed(feed *Feed) error {
// Default publishStatus to "held" if not set
// Auto-deny feeds with no language or unsupported type
// Default publishStatus to "hold" if not set
// Auto-skip feeds with a missing or non-English language
// Auto-pass feeds from our own domain
publishStatus := feed.PublishStatus
if publishStatus == "" {
if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
publishStatus = "pass"
} else if feed.Language == "" {
publishStatus = "deny"
} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
publishStatus = "skip"
} else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
publishStatus = "deny"
publishStatus = "skip"
} else {
publishStatus = "held"
publishStatus = "hold"
}
}
@@ -330,7 +331,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
if publishStatus != nil {
feed.PublishStatus = *publishStatus
} else {
feed.PublishStatus = "held"
feed.PublishStatus = "hold"
}
feed.PublishAccount = StringValue(publishAccount)
@@ -526,7 +527,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
if publishStatus != nil {
feed.PublishStatus = *publishStatus
} else {
feed.PublishStatus = "held"
feed.PublishStatus = "hold"
}
feed.PublishAccount = StringValue(publishAccount)
@@ -558,10 +559,19 @@ func (c *Crawler) saveItem(item *Item) error {
}
}
// Serialize tags as JSON
var tagsJSON *string
if len(item.Tags) > 0 {
if data, err := json.Marshal(item.Tags); err == nil {
s := string(data)
tagsJSON = &s
}
}
_, err := c.db.Exec(`
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT(feed_url, guid) DO UPDATE SET
title = EXCLUDED.title,
link = EXCLUDED.link,
@@ -573,12 +583,13 @@ func (c *Crawler) saveItem(item *Item) error {
enclosure_url = EXCLUDED.enclosure_url,
enclosure_type = EXCLUDED.enclosure_type,
enclosure_length = EXCLUDED.enclosure_length,
image_urls = EXCLUDED.image_urls
image_urls = EXCLUDED.image_urls,
tags = EXCLUDED.tags
`,
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
)
return err
}
@@ -620,10 +631,19 @@ func (c *Crawler) saveItems(items []*Item) error {
}
}
// Serialize tags as JSON
var tagsJSON *string
if len(item.Tags) > 0 {
if data, err := json.Marshal(item.Tags); err == nil {
s := string(data)
tagsJSON = &s
}
}
_, err := tx.Exec(context.Background(), `
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT(feed_url, guid) DO UPDATE SET
title = EXCLUDED.title,
link = EXCLUDED.link,
@@ -635,12 +655,13 @@ func (c *Crawler) saveItems(items []*Item) error {
enclosure_url = EXCLUDED.enclosure_url,
enclosure_type = EXCLUDED.enclosure_type,
enclosure_length = EXCLUDED.enclosure_length,
image_urls = EXCLUDED.image_urls
image_urls = EXCLUDED.image_urls,
tags = EXCLUDED.tags
`,
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
)
if err != nil {
continue // Skip failed items
@@ -654,7 +675,7 @@ func (c *Crawler) saveItems(items []*Item) error {
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1
@@ -674,7 +695,7 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
tsquery := ToSearchQuery(query)
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE search_vector @@ to_tsquery('english', $1)
@@ -698,14 +719,14 @@ func scanItems(rows pgx.Rows) ([]*Item, error) {
var pubDate, updatedAt, publishedAt *time.Time
var enclosureUrl, enclosureType *string
var enclosureLength *int64
var imageUrlsJSON *string
var imageUrlsJSON, tagsJSON *string
var publishedUri *string
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
&publishedAt, &publishedUri,
); err != nil {
continue
@@ -739,6 +760,14 @@ func scanItems(rows pgx.Rows) ([]*Item, error) {
}
}
// Parse tags JSON
if tagsJSON != nil && *tagsJSON != "" {
var tags []string
if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
item.Tags = tags
}
}
item.PublishedAt = TimeValue(publishedAt)
item.PublishedUri = StringValue(publishedUri)
@@ -907,6 +936,11 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, err
}
@@ -939,6 +973,11 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
} else {
feed.Status = "error"
}
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, nil
}
@@ -952,6 +991,11 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
}
c.saveFeed(feed)
return false, err
}
@@ -992,7 +1036,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
return true, nil
}
// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'deny')
// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip')
// If status is 'pass', the account handle is also set (auto-derived if empty)
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
feedURL = normalizeURL(feedURL)
@@ -1031,7 +1075,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
return scanFeeds(rows)
}
// GetPublishCandidates returns feeds that are held for review and have items
// GetPublishCandidates returns feeds that are on hold for review and have items
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, category, title, description, language, site_url,
@@ -1044,7 +1088,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
no_update,
publish_status, publish_account
FROM feeds
WHERE publish_status = 'held' AND item_count > 0 AND status = 'active'
WHERE publish_status = 'hold' AND item_count > 0 AND status = 'active'
ORDER BY item_count DESC
LIMIT $1
`, limit)
@@ -1060,7 +1104,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
enclosure_url, enclosure_type, enclosure_length, image_urls,
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1 AND published_at IS NULL
+4 -1
@@ -56,7 +56,10 @@ func main() {
// Publish loop (background) - autopublishes items for approved feeds
go crawler.StartPublishLoop()
// Crawl loop (background)
// Domain check loop (background) - verifies approved domains are reachable
go crawler.StartDomainCheckLoop()
// Crawl loop (background) - crawls checked domains for feeds
go crawler.StartCrawlLoop()
// Wait for shutdown signal
+50 -18
@@ -41,6 +41,7 @@ type RSSItem struct {
Author string `xml:"author"`
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
PubDate string `xml:"pubDate"`
Categories []string `xml:"category"`
Enclosure *RSSEnclosure `xml:"enclosure"`
// iTunes item elements
ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
@@ -82,14 +83,20 @@ type AtomFeed struct {
}
type AtomEntry struct {
ID string `xml:"id"`
Title string `xml:"title"`
Links []AtomLink `xml:"link"`
Summary string `xml:"summary"`
Content AtomContent `xml:"content"`
Author AtomAuthor `xml:"author"`
Updated string `xml:"updated"`
Published string `xml:"published"`
ID string `xml:"id"`
Title string `xml:"title"`
Links []AtomLink `xml:"link"`
Summary string `xml:"summary"`
Content AtomContent `xml:"content"`
Author AtomAuthor `xml:"author"`
Updated string `xml:"updated"`
Published string `xml:"published"`
Categories []AtomCategory `xml:"category"`
}
type AtomCategory struct {
Term string `xml:"term,attr"`
Label string `xml:"label,attr"`
}
type AtomContent struct {
@@ -222,6 +229,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
// Extract images from various sources
item.ImageURLs = extractItemImages(rssItem)
// Extract categories/tags
if len(rssItem.Categories) > 0 {
item.Tags = rssItem.Categories
}
items = append(items, item)
}
@@ -324,6 +336,20 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
}
}
// Extract categories/tags
if len(entry.Categories) > 0 {
for _, cat := range entry.Categories {
// Prefer label, fall back to term
tag := cat.Label
if tag == "" {
tag = cat.Term
}
if tag != "" {
item.Tags = append(item.Tags, tag)
}
}
}
items = append(items, item)
}
@@ -514,16 +540,17 @@ type JSONFeed struct {
}
type JSONFeedItem struct {
ID string `json:"id"`
URL string `json:"url"`
Title string `json:"title"`
ContentHTML string `json:"content_html"`
ContentText string `json:"content_text"`
Summary string `json:"summary"`
Image string `json:"image"`
DatePublished string `json:"date_published"`
DateModified string `json:"date_modified"`
Authors []JSONFeedAuthor `json:"authors"`
ID string `json:"id"`
URL string `json:"url"`
Title string `json:"title"`
ContentHTML string `json:"content_html"`
ContentText string `json:"content_text"`
Summary string `json:"summary"`
Image string `json:"image"`
DatePublished string `json:"date_published"`
DateModified string `json:"date_modified"`
Authors []JSONFeedAuthor `json:"authors"`
Tags []string `json:"tags"`
Attachments []JSONFeedAttachment `json:"attachments"`
}
@@ -600,6 +627,11 @@ func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
item.ImageURLs = []string{ji.Image}
}
// Tags
if len(ji.Tags) > 0 {
item.Tags = ji.Tags
}
// Attachments (enclosures)
for _, att := range ji.Attachments {
if att.URL != "" {
+367 -197
@@ -12,13 +12,14 @@ import (
"io"
"net/http"
"net/url"
"os"
"regexp"
"strings"
"time"
"unicode/utf8"
_ "golang.org/x/image/webp"
"go.deanishe.net/favicon"
"golang.org/x/image/draw"
_ "golang.org/x/image/webp"
)
// Publisher handles posting items to AT Protocol PDS
@@ -29,24 +30,24 @@ type Publisher struct {
// PDSSession holds authentication info for a PDS account
type PDSSession struct {
DID string `json:"did"`
Handle string `json:"handle"`
AccessJwt string `json:"accessJwt"`
RefreshJwt string `json:"refreshJwt"`
DID string `json:"did"`
Handle string `json:"handle"`
AccessJwt string `json:"accessJwt"`
RefreshJwt string `json:"refreshJwt"`
}
// BskyPost represents an app.bsky.feed.post record
type BskyPost struct {
Type string `json:"$type"`
Text string `json:"text"`
CreatedAt string `json:"createdAt"`
Facets []BskyFacet `json:"facets,omitempty"`
Embed *BskyEmbed `json:"embed,omitempty"`
Type string `json:"$type"`
Text string `json:"text"`
CreatedAt string `json:"createdAt"`
Facets []BskyFacet `json:"facets,omitempty"`
Embed *BskyEmbed `json:"embed,omitempty"`
}
type BskyFacet struct {
Index BskyByteSlice `json:"index"`
Features []BskyFeature `json:"features"`
Index BskyByteSlice `json:"index"`
Features []BskyFeature `json:"features"`
}
type BskyByteSlice struct {
@@ -57,12 +58,13 @@ type BskyByteSlice struct {
type BskyFeature struct {
Type string `json:"$type"`
URI string `json:"uri,omitempty"`
Tag string `json:"tag,omitempty"` // For hashtag facets
}
type BskyEmbed struct {
Type string `json:"$type"`
External *BskyExternal `json:"external,omitempty"`
Images []BskyImage `json:"images,omitempty"`
Type string `json:"$type"`
External *BskyExternal `json:"external,omitempty"`
Images []BskyImage `json:"images,omitempty"`
}
type BskyExternal struct {
@@ -73,9 +75,9 @@ type BskyExternal struct {
}
type BskyImage struct {
Alt string `json:"alt"`
Image *BlobRef `json:"image"`
AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"`
Alt string `json:"alt"`
Image *BlobRef `json:"image"`
AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"`
}
type BskyAspectRatio struct {
@@ -209,6 +211,66 @@ func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string
return result.Code, nil
}
// FollowAccount creates a follow record from the authenticated session to the target DID
func (p *Publisher) FollowAccount(session *PDSSession, targetDID string) error {
// Create follow record
now := time.Now().UTC().Format(time.RFC3339)
record := map[string]interface{}{
"$type": "app.bsky.graph.follow",
"subject": targetDID,
"createdAt": now,
}
payload := map[string]interface{}{
"repo": session.DID,
"collection": "app.bsky.graph.follow",
"record": record,
}
body, err := json.Marshal(payload)
if err != nil {
return err
}
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
resp, err := p.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
return fmt.Errorf("follow failed: %s - %s", resp.Status, string(respBody))
}
return nil
}
// FollowAsDirectory logs in as the directory account and follows the target DID
func (p *Publisher) FollowAsDirectory(targetDID string) error {
dirHandle := os.Getenv("DIRECTORY_HANDLE")
dirPassword := os.Getenv("DIRECTORY_PASSWORD")
if dirHandle == "" || dirPassword == "" {
// Silently skip if directory account not configured
return nil
}
session, err := p.CreateSession(dirHandle, dirPassword)
if err != nil {
return fmt.Errorf("directory login failed: %w", err)
}
return p.FollowAccount(session, targetDID)
}
// TID alphabet for base32-sortable encoding
const tidAlphabet = "234567abcdefghijklmnopqrstuvwxyz"
@@ -268,6 +330,116 @@ func extractURLs(text string) []string {
return urls
}
// toCamelCaseTag converts a tag string to camelCase hashtag format
// e.g., "Lagos News" -> "lagosNews", "AI" -> "ai", "machine learning" -> "machineLearning"
func toCamelCaseTag(tag string) string {
tag = strings.TrimSpace(tag)
if tag == "" {
return ""
}
// Remove any # prefix if present
tag = strings.TrimPrefix(tag, "#")
// Split on spaces and other separators
words := strings.FieldsFunc(tag, func(r rune) bool {
return r == ' ' || r == '-' || r == '_'
})
if len(words) == 0 {
return ""
}
// If single word, return lowercased
if len(words) == 1 {
return strings.ToLower(words[0])
}
// Multiple words: lowercase first word, capitalize first letter of subsequent words
var result strings.Builder
for i, word := range words {
if word == "" {
continue
}
runes := []rune(word)
if len(runes) > 0 {
if i == 0 || result.Len() == 0 {
// First word: all lowercase
result.WriteString(strings.ToLower(word))
} else {
// Subsequent words: capitalize first letter, lowercase rest
result.WriteString(strings.ToUpper(string(runes[0])))
if len(runes) > 1 {
result.WriteString(strings.ToLower(string(runes[1:])))
}
}
}
}
return result.String()
}
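// Hypothetical usage sketch (assumes this package plus a standard "testing" import
// in a _test.go file): a table-driven check of toCamelCaseTag, with expected values
// taken from the doc comment above.
func TestToCamelCaseTag(t *testing.T) {
	cases := map[string]string{
		"Lagos News":       "lagosNews",
		"AI":               "ai",
		"machine learning": "machineLearning",
		"#Already-Tagged":  "alreadyTagged",
	}
	for in, want := range cases {
		if got := toCamelCaseTag(in); got != want {
			t.Errorf("toCamelCaseTag(%q) = %q, want %q", in, got, want)
		}
	}
}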
// formatTagsForPost converts item tags to hashtag text and facets
// Returns the hashtag line (e.g., "#AI #MachineLearning #News") and facets
func formatTagsForPost(tags []string, textOffset int) (string, []BskyFacet) {
if len(tags) == 0 {
return "", nil
}
// Dedupe and convert tags
seen := make(map[string]bool)
var hashtags []string
for _, tag := range tags {
camel := toCamelCaseTag(tag)
if camel == "" || seen[strings.ToLower(camel)] {
continue
}
seen[strings.ToLower(camel)] = true
hashtags = append(hashtags, camel)
}
if len(hashtags) == 0 {
return "", nil
}
// Limit to 5 tags to keep post compact
if len(hashtags) > 5 {
hashtags = hashtags[:5]
}
// Build the hashtag line and facets
var line strings.Builder
var facets []BskyFacet
currentOffset := textOffset
for i, ht := range hashtags {
if i > 0 {
line.WriteString(" ")
currentOffset++
}
hashtagText := "#" + ht
byteStart := currentOffset
byteEnd := currentOffset + len(hashtagText)
line.WriteString(hashtagText)
facets = append(facets, BskyFacet{
Index: BskyByteSlice{
ByteStart: byteStart,
ByteEnd: byteEnd,
},
Features: []BskyFeature{{
Type: "app.bsky.richtext.facet#tag",
Tag: ht,
}},
})
currentOffset = byteEnd
}
return line.String(), facets
}
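// Hypothetical usage sketch: given an item's tags, formatTagsForPost dedupes and
// camelCases them, then returns the hashtag line plus tag facets whose byte ranges
// index into that line, e.g.:
//
//	line, facets := formatTagsForPost([]string{"AI", "Machine Learning", "AI"}, 0)
//	// line == "#ai #machineLearning"
//	// facets[0] spans bytes 0-3  ("#ai"),              Tag "ai"
//	// facets[1] spans bytes 4-20 ("#machineLearning"), Tag "machineLearning"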
// PublishItem posts a feed item to the PDS
// Returns the AT URI of the created record, or error
func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
@@ -316,81 +488,11 @@ func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error)
}
}
// Build post text: title + link labels
// Bluesky has 300 grapheme limit - use rune count as approximation
const maxGraphemes = 295 // Leave some margin
// Create labeled links: "Article", "Audio", etc.
type labeledLink struct {
Label string
URL string
// Get the primary URL (article link)
primaryURL := ""
if len(allURLs) > 0 {
primaryURL = allURLs[0]
}
var links []labeledLink
for i, u := range allURLs {
if i == 0 {
// First URL is the article link
links = append(links, labeledLink{Label: "Article", URL: u})
} else if item.Enclosure != nil && u == item.Enclosure.URL {
// Enclosure URL - label based on type
encType := strings.ToLower(item.Enclosure.Type)
if strings.HasPrefix(encType, "audio/") {
links = append(links, labeledLink{Label: "Audio", URL: u})
} else if strings.HasPrefix(encType, "video/") {
links = append(links, labeledLink{Label: "Video", URL: u})
} else {
links = append(links, labeledLink{Label: "Media", URL: u})
}
} else if strings.Contains(u, "news.ycombinator.com") {
links = append(links, labeledLink{Label: "Comments", URL: u})
} else {
links = append(links, labeledLink{Label: "Link", URL: u})
}
}
// Calculate space needed for labels (in runes)
// Format: "Article · Audio" or just "Article"
labelSpace := 0
for i, link := range links {
labelSpace += utf8.RuneCountInString(link.Label)
if i > 0 {
labelSpace += 3 // " · " separator
}
}
labelSpace += 2 // \n\n before labels
// Truncate title if needed
title := item.Title
titleRunes := utf8.RuneCountInString(title)
maxTitleRunes := maxGraphemes - labelSpace - 3 // -3 for "..."
if titleRunes+labelSpace > maxGraphemes {
if maxTitleRunes > 10 {
runes := []rune(title)
if len(runes) > maxTitleRunes {
title = string(runes[:maxTitleRunes]) + "..."
}
} else {
runes := []rune(title)
if len(runes) > 50 {
title = string(runes[:50]) + "..."
}
}
}
// Build final text with labels
var textBuilder strings.Builder
textBuilder.WriteString(title)
if len(links) > 0 {
textBuilder.WriteString("\n\n")
for i, link := range links {
if i > 0 {
textBuilder.WriteString(" · ")
}
textBuilder.WriteString(link.Label)
}
}
text := textBuilder.String()
// Use original publication date if available, otherwise current time
createdAt := time.Now()
@@ -398,60 +500,34 @@ func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error)
createdAt = item.PubDate
}
// Build post text with hashtags if available
// The link card shows the title, description, and thumbnail
// Clicking the card doesn't trigger the "leaving Bluesky" warning
postText := ""
var facets []BskyFacet
if len(item.Tags) > 0 {
tagLine, tagFacets := formatTagsForPost(item.Tags, 0)
postText = tagLine
facets = tagFacets
}
post := BskyPost{
Type: "app.bsky.feed.post",
Text: text,
Text: postText,
CreatedAt: createdAt.Format(time.RFC3339),
Facets: facets,
}
// Add facets for labeled links
// Find each label in the text and create a facet linking to its URL
searchPos := len(title) + 2 // Start after title + \n\n
for _, link := range links {
labelStart := strings.Index(text[searchPos:], link.Label)
if labelStart >= 0 {
labelStart += searchPos
byteStart := len(text[:labelStart])
byteEnd := byteStart + len(link.Label)
post.Facets = append(post.Facets, BskyFacet{
Index: BskyByteSlice{
ByteStart: byteStart,
ByteEnd: byteEnd,
},
Features: []BskyFeature{
{
Type: "app.bsky.richtext.facet#link",
URI: link.URL,
},
},
})
searchPos = labelStart + len(link.Label)
}
}
// Decide embed type based on content
// Priority: images > external link card
if len(item.ImageURLs) > 0 {
// Try to upload images (up to 4)
uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title)
if len(uploadedImages) > 0 {
post.Embed = &BskyEmbed{
Type: "app.bsky.embed.images",
Images: uploadedImages,
}
}
}
// Fall back to external embed if no images were uploaded
if post.Embed == nil && len(allURLs) > 0 {
// Always use external embed (link card) - clicking the card doesn't show "leaving" warning
if primaryURL != "" {
external := &BskyExternal{
URI: allURLs[0],
URI: primaryURL,
Title: item.Title,
Description: truncate(stripHTML(item.Description), 300),
}
// Try to add thumbnail from first image
// Add thumbnail from first image if available
if len(item.ImageURLs) > 0 {
if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
external.Thumb = thumb
@@ -548,15 +624,15 @@ func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altTex
return images
}
// fetchAndUploadImage downloads an image and uploads it to the PDS
// FetchFavicon tries to get a favicon URL for a site
// Uses go.deanishe.net/favicon library which parses HTML, manifests, and checks common paths
// Returns the favicon URL or empty string if not found
func (p *Publisher) FetchFavicon(siteURL string) string {
if siteURL == "" {
return ""
}
// Parse the site URL to get the host
// Ensure URL has scheme
if !strings.Contains(siteURL, "://") {
siteURL = "https://" + siteURL
}
@@ -565,23 +641,81 @@ func (p *Publisher) FetchFavicon(siteURL string) string {
return ""
}
// Try common favicon locations
faviconURLs := []string{
fmt.Sprintf("https://%s/favicon.ico", u.Host),
fmt.Sprintf("https://%s/favicon.png", u.Host),
fmt.Sprintf("https://%s/apple-touch-icon.png", u.Host),
}
// Create finder with custom HTTP client
// Note: Don't use IgnoreNoSize as it filters out valid favicon.ico files that don't have size metadata
finder := favicon.New(
favicon.WithClient(p.httpClient),
)
for _, faviconURL := range faviconURLs {
resp, err := p.httpClient.Head(faviconURL)
if err != nil {
continue
// Find icons - library checks HTML <link> tags, manifests, OG images, common paths
icons, err := finder.Find(siteURL)
if err == nil && len(icons) > 0 {
// Filter and score icons for avatar use
// Prefer: square icons, reasonable size, PNG format, actual favicons over OG images
var bestIcon string
var bestScore int
for _, icon := range icons {
// Skip tiny icons (likely tracking pixels)
if icon.Width > 0 && icon.Width < 32 {
continue
}
// Skip Open Graph images (meant for link previews, usually wide banners)
lowerURL := strings.ToLower(icon.URL)
if strings.Contains(lowerURL, "og-image") || strings.Contains(lowerURL, "og_image") ||
strings.Contains(lowerURL, "opengraph") || strings.Contains(lowerURL, "twitter") {
continue
}
// Skip wide images (aspect ratio > 1.5 means it's a banner, not an icon)
if icon.Width > 0 && icon.Height > 0 {
ratio := float64(icon.Width) / float64(icon.Height)
if ratio > 1.5 || ratio < 0.67 {
continue
}
}
// Score the icon
score := 0
// Prefer actual favicon paths
if strings.Contains(lowerURL, "favicon") || strings.Contains(lowerURL, "icon") ||
strings.Contains(lowerURL, "apple-touch") {
score += 100
}
// Prefer PNG over other formats
if icon.MimeType == "image/png" {
score += 50
} else if icon.MimeType == "image/x-icon" || strings.HasSuffix(lowerURL, ".ico") {
score += 40
} else if icon.MimeType == "image/jpeg" {
score += 10 // JPEG less preferred for icons
}
// Prefer larger icons (but not too large)
if icon.Width >= 64 && icon.Width <= 512 {
score += 30
} else if icon.Width > 0 {
score += 10
}
if score > bestScore {
bestScore = score
bestIcon = icon.URL
}
}
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
contentType := resp.Header.Get("Content-Type")
if strings.HasPrefix(contentType, "image/") || strings.HasSuffix(faviconURL, ".ico") {
return faviconURL
if bestIcon != "" {
return bestIcon
}
// Fall back to first non-OG icon
for _, icon := range icons {
lowerURL := strings.ToLower(icon.URL)
if !strings.Contains(lowerURL, "og-image") && !strings.Contains(lowerURL, "og_image") {
return icon.URL
}
}
}
@@ -763,8 +897,9 @@ func stripHTML(s string) string {
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
// news.ycombinator.com/rss → ycombinator.1440.news
//
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
// news.ycombinator.com/rss → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
const maxSubdomainLen = 18 // PDS limit for first segment
@@ -792,7 +927,7 @@ func DeriveHandleFromFeed(feedURL string) string {
skipPathWords := map[string]bool{
"rss": true, "feed": true, "feeds": true, "atom": true,
"xml": true, "default": true, "index": true, "services": true,
"nyt": true, "blog": true,
"nyt": true,
}
var pathParts []string
@@ -1274,8 +1409,8 @@ func (p *Publisher) DeleteAccount(adminPassword, did string) error {
}
// FetchFavicon downloads a favicon/icon from a URL
// Uses go.deanishe.net/favicon library to find the best icon
func FetchFavicon(siteURL string) ([]byte, string, error) {
// Try common favicon locations
if !strings.HasPrefix(siteURL, "http") {
siteURL = "https://" + siteURL
}
@@ -1285,48 +1420,83 @@ func FetchFavicon(siteURL string) ([]byte, string, error) {
return nil, "", err
}
baseURL := u.Scheme + "://" + u.Host
// Try apple-touch-icon first (usually higher quality)
iconURLs := []string{
baseURL + "/apple-touch-icon.png",
baseURL + "/apple-touch-icon-precomposed.png",
baseURL + "/favicon.png",
baseURL + "/favicon.ico",
}
client := &http.Client{Timeout: 10 * time.Second}
for _, iconURL := range iconURLs {
resp, err := client.Get(iconURL)
if err != nil {
continue
}
defer resp.Body.Close()
// Use favicon library to find icons
finder := favicon.New(
favicon.WithClient(client),
favicon.IgnoreNoSize,
)
if resp.StatusCode != http.StatusOK {
continue
}
data, err := io.ReadAll(resp.Body)
if err != nil {
continue
}
// Determine mime type
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
if strings.HasSuffix(iconURL, ".png") {
contentType = "image/png"
} else if strings.HasSuffix(iconURL, ".ico") {
contentType = "image/x-icon"
} else {
contentType = "image/png" // default
}
}
return data, contentType, nil
icons, err := finder.Find(siteURL)
if err != nil || len(icons) == 0 {
// Fallback to Google's favicon service
googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
return fetchIconBytes(client, googleURL)
}
return nil, "", fmt.Errorf("no favicon found for %s", siteURL)
// Try icons in order (sorted by size, largest first)
// Prefer PNG/JPEG over ICO
var iconURLs []string
for _, icon := range icons {
if icon.Width > 0 && icon.Width < 32 {
continue // Skip tiny icons
}
if icon.MimeType == "image/png" || icon.MimeType == "image/jpeg" {
iconURLs = append([]string{icon.URL}, iconURLs...) // Prepend PNG/JPEG
} else {
iconURLs = append(iconURLs, icon.URL)
}
}
// If no good icons, use all of them
if len(iconURLs) == 0 {
for _, icon := range icons {
iconURLs = append(iconURLs, icon.URL)
}
}
// Try to download each icon
for _, iconURL := range iconURLs {
data, mimeType, err := fetchIconBytes(client, iconURL)
if err == nil && len(data) > 0 {
return data, mimeType, nil
}
}
// Final fallback to Google
googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
return fetchIconBytes(client, googleURL)
}
// fetchIconBytes downloads an icon and returns its bytes and mime type
func fetchIconBytes(client *http.Client, iconURL string) ([]byte, string, error) {
resp, err := client.Get(iconURL)
if err != nil {
return nil, "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, "", fmt.Errorf("HTTP %d", resp.StatusCode)
}
data, err := io.ReadAll(resp.Body)
if err != nil {
return nil, "", err
}
// Determine mime type
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
if strings.HasSuffix(iconURL, ".png") {
contentType = "image/png"
} else if strings.HasSuffix(iconURL, ".ico") {
contentType = "image/x-icon"
} else {
contentType = "image/png"
}
}
return data, contentType, nil
}
+807 -680
File diff suppressed because it is too large