Dashboard UI overhaul: inline feed details, TLD filtering, status improvements
- Feed details now expand inline instead of navigating to new page
- Add TLD section headers with domains sorted by TLD then name
- Add TLD filter button to show/hide domain sections by TLD
- Feed status behavior: pass creates account, hold crawls only, skip stops, drop cleans up
- Auto-follow new accounts from directory account (1440.news)
- Fix handle derivation (removed duplicate .1440.news suffix)
- Increase domain import batch size to 100k
- Various bug fixes for account creation and profile updates

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -45,10 +45,11 @@ Multi-file Go application that crawls websites for RSS/Atom feeds, stores them i

### Concurrent Loops (main.go)

The application runs six independent goroutine loops:
- **Import loop** - Reads `vertices.txt.gz` and inserts domains into DB in 10k batches
- **Crawl loop** - Worker pool processes unchecked domains, discovers feeds
- **Check loop** - Worker pool re-checks known feeds for updates (conditional HTTP)
The application runs seven independent goroutine loops:
- **Import loop** - Reads `vertices.txt.gz` and inserts domains into DB in 10k batches (status='hold')
- **Domain check loop** - HEAD requests to verify approved domains are reachable
- **Crawl loop** - Worker pool crawls verified domains for feed discovery
- **Feed check loop** - Worker pool re-checks known feeds for updates (conditional HTTP)
- **Stats loop** - Updates cached dashboard statistics every minute
- **Cleanup loop** - Removes items older than 12 months (weekly)
- **Publish loop** - Autopublishes items from approved feeds to AT Protocol PDS
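For orientation, a minimal sketch of how these loops are launched. `StartDomainCheckLoop`, `StartCrawlLoop` and `StartPublishLoop` appear in this commit; the other method names and the shutdown handling are assumptions, not code from the repository.

```go
// Rough sketch of the seven-loop layout; each loop runs forever in its own goroutine.
func runLoops(crawler *Crawler) {
	go crawler.StartImportLoop()      // assumed name: imports vertices.txt.gz in batches (status='hold')
	go crawler.StartDomainCheckLoop() // HEAD-verifies approved domains
	go crawler.StartCrawlLoop()       // crawls checked domains for feed discovery
	go crawler.StartFeedCheckLoop()   // assumed name: conditional re-checks of known feeds
	go crawler.StartStatsLoop()       // assumed name: refreshes cached dashboard statistics
	go crawler.StartCleanupLoop()     // assumed name: weekly purge of items older than 12 months
	go crawler.StartPublishLoop()     // autopublishes items from approved feeds to the PDS

	select {} // block forever; the real main waits on a shutdown signal instead
}
```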
@@ -70,8 +71,8 @@ The application runs six independent goroutine loops:

### Database Schema

PostgreSQL with pgx driver, using connection pooling:
- **domains** - Hosts to crawl (status: unchecked/checked/error)
- **feeds** - Discovered RSS/Atom feeds with metadata and cache headers
- **domains** - Hosts to crawl (status: hold/pass/skip/fail)
- **feeds** - Discovered RSS/Atom feeds with metadata and cache headers (publish_status: hold/pass/skip)
- **items** - Individual feed entries (guid + feed_url unique)
- **search_vector** - GENERATED tsvector columns for full-text search (GIN indexed)
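A small sketch of opening the pgx connection pool the schema is served through; this assumes pgx v5's `pgxpool`, and the DSN and pool size are placeholders.

```go
package main

import (
	"context"
	"fmt"

	"github.com/jackc/pgx/v5/pgxpool"
)

// openPool builds a pooled connection; the loops above share this pool.
func openPool(ctx context.Context, dsn string) (*pgxpool.Pool, error) {
	cfg, err := pgxpool.ParseConfig(dsn) // e.g. "postgres://user:pass@localhost:5432/feeds"
	if err != nil {
		return nil, fmt.Errorf("parse config: %w", err)
	}
	cfg.MaxConns = 10 // illustrative pool size

	pool, err := pgxpool.NewWithConfig(ctx, cfg)
	if err != nil {
		return nil, fmt.Errorf("connect: %w", err)
	}
	// Verify the connection before handing the pool to the loops.
	if err := pool.Ping(ctx); err != nil {
		pool.Close()
		return nil, fmt.Errorf("ping: %w", err)
	}
	return pool, nil
}
```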
@@ -79,11 +80,12 @@ Column naming: snake_case (e.g., `source_host`, `pub_date`, `item_count`)

### Crawl Logic

1. Domain picked from `unchecked` status (random order)
2. Try HTTPS, fall back to HTTP
3. Recursive crawl up to MaxDepth=10, MaxPagesPerHost=10
4. Extract `<link rel="alternate">` and anchor hrefs containing rss/atom/feed
5. Parse discovered feeds for metadata, save with next_crawl_at
1. Domain manually approved (status set to 'pass')
2. Check stage: HEAD request verifies domain is reachable, sets last_checked_at
3. Crawl stage: Full recursive crawl (HTTPS, fallback HTTP)
4. Recursive crawl up to MaxDepth=10, MaxPagesPerHost=10
5. Extract `<link rel="alternate">` and anchor hrefs containing rss/atom/feed
6. Parse discovered feeds for metadata, save with next_crawl_at
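Step 5 of the new sequence pulls feed URLs out of crawled pages; here is a rough sketch of that extraction using golang.org/x/net/html. It illustrates the idea only and is not the crawler's exact selector logic.

```go
package main

import (
	"io"
	"strings"

	"golang.org/x/net/html"
)

// extractFeedLinks collects candidate feed URLs: <link rel="alternate" type="...xml">
// plus anchors whose href mentions rss/atom/feed.
func extractFeedLinks(r io.Reader) ([]string, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, err
	}
	var out []string
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr := func(key string) string {
				for _, a := range n.Attr {
					if strings.EqualFold(a.Key, key) {
						return a.Val
					}
				}
				return ""
			}
			switch n.Data {
			case "link":
				// <link rel="alternate" type="application/rss+xml" href="...">
				if strings.EqualFold(attr("rel"), "alternate") &&
					strings.Contains(strings.ToLower(attr("type")), "xml") {
					out = append(out, attr("href"))
				}
			case "a":
				href := strings.ToLower(attr("href"))
				if strings.Contains(href, "rss") || strings.Contains(href, "atom") || strings.Contains(href, "feed") {
					out = append(out, attr("href"))
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return out, nil
}
```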
### Feed Checking
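Feed checks rely on conditional HTTP, as the hunk below notes. A minimal sketch of the request side, assuming the cached ETag and Last-Modified values are stored on the feed row (the function and parameter names are placeholders):

```go
package main

import "net/http"

// checkConditional issues a GET with the validators cached from the last fetch.
// A 304 means the feed is unchanged and the body can be skipped entirely.
func checkConditional(client *http.Client, url, etag, lastModified string) (changed bool, resp *http.Response, err error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return false, nil, err
	}
	if etag != "" {
		req.Header.Set("If-None-Match", etag)
	}
	if lastModified != "" {
		req.Header.Set("If-Modified-Since", lastModified)
	}

	resp, err = client.Do(req)
	if err != nil {
		return false, nil, err
	}
	if resp.StatusCode == http.StatusNotModified {
		resp.Body.Close()
		return false, nil, nil // unchanged; caller applies the adaptive backoff
	}
	return true, resp, nil
}
```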
@@ -92,7 +94,16 @@ Uses conditional HTTP (ETag, If-Modified-Since). Adaptive backoff: base 100s + 1

### Publishing

Feeds with `publish_status = 'pass'` have their items automatically posted to AT Protocol.
Status values: `held` (default), `pass` (approved), `deny` (rejected).
Status values: `hold` (default/pending review), `pass` (approved), `skip` (rejected).

### Domain Processing (Two-Stage)

1. **Check stage** - HEAD request to verify domain is reachable
2. **Crawl stage** - Full recursive crawl for feed discovery

Domain status values: `hold` (pending), `pass` (approved), `skip` (rejected), `fail` (error).
Domains starting with a digit (except 1440.news) are auto-skipped.
Non-English feeds are auto-skipped.
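For reference, a compact sketch of the domain status values and the transitions the two-stage pipeline implies. The transition table is illustrative; the real code simply updates `status` in SQL.

```go
package main

// Domain status values used throughout this commit.
const (
	StatusHold = "hold" // imported, awaiting manual review
	StatusPass = "pass" // approved: eligible for the check and crawl stages
	StatusSkip = "skip" // rejected (manually or by the auto-skip rules)
	StatusFail = "fail" // check or crawl stage hit an error
)

// allowedTransitions is an illustrative view of how a domain moves between statuses.
var allowedTransitions = map[string][]string{
	StatusHold: {StatusPass, StatusSkip},
	StatusPass: {StatusFail, StatusSkip},
	StatusFail: {StatusPass, StatusSkip}, // e.g. re-approve after a transient error
}

func canTransition(from, to string) bool {
	for _, t := range allowedTransitions[from] {
		if t == to {
			return true
		}
	}
	return false
}
```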
## AT Protocol Integration
@@ -213,14 +213,14 @@ func (c *Crawler) StartPublishLoop() {
|
||||
if displayName == "" {
|
||||
displayName = account
|
||||
}
|
||||
// Build description with feed URL
|
||||
description := feedInfo.Description
|
||||
// Build description with feed URL (strip HTML tags)
|
||||
description := stripHTML(feedInfo.Description)
|
||||
if description == "" {
|
||||
description = "News feed via 1440.news"
|
||||
}
|
||||
// Add feed URL to description
|
||||
// Add feed URL as first line of description
|
||||
feedURLFull := "https://" + item.FeedURL
|
||||
description = description + "\n\n" + feedURLFull
|
||||
description = feedURLFull + "\n\n" + description
|
||||
// Truncate if needed
|
||||
if len(displayName) > 64 {
|
||||
displayName = displayName[:61] + "..."
|
||||
@@ -230,8 +230,13 @@ func (c *Crawler) StartPublishLoop() {
|
||||
}
|
||||
// Fetch and upload favicon as avatar
|
||||
var avatar *BlobRef
|
||||
if feedInfo.SiteURL != "" {
|
||||
faviconURL := publisher.FetchFavicon(feedInfo.SiteURL)
|
||||
faviconSource := feedInfo.SiteURL
|
||||
if faviconSource == "" {
|
||||
// Fallback to deriving from feed URL
|
||||
faviconSource = feedInfo.SourceHost
|
||||
}
|
||||
if faviconSource != "" {
|
||||
faviconURL := publisher.FetchFavicon(faviconSource)
|
||||
if faviconURL != "" {
|
||||
avatar = publisher.fetchAndUploadImage(session, faviconURL)
|
||||
}
|
||||
@@ -241,6 +246,13 @@ func (c *Crawler) StartPublishLoop() {
|
||||
} else {
|
||||
fmt.Printf("Publish: set profile for %s\n", account)
|
||||
}
|
||||
|
||||
// Have directory account follow this new account
|
||||
if err := publisher.FollowAsDirectory(session.DID); err != nil {
|
||||
fmt.Printf("Publish: directory follow failed for %s: %v\n", account, err)
|
||||
} else {
|
||||
fmt.Printf("Publish: directory now following %s\n", account)
|
||||
}
|
||||
}
|
||||
}
|
||||
sessions[account] = session
|
||||
@@ -256,15 +268,6 @@ func (c *Crawler) StartPublishLoop() {
|
||||
fmt.Printf("Publish: short URL failed for %s: %v\n", item.Link[:min(40, len(item.Link))], err)
|
||||
}
|
||||
}
|
||||
if item.Enclosure != nil && item.Enclosure.URL != "" {
|
||||
if shortURL, err := c.GetShortURLForPost(item.Enclosure.URL, &item.ID, item.FeedURL); err == nil {
|
||||
itemToPublish.Enclosure = &Enclosure{
|
||||
URL: shortURL,
|
||||
Type: item.Enclosure.Type,
|
||||
Length: item.Enclosure.Length,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Publish the item
|
||||
uri, err := publisher.PublishItem(session, &itemToPublish)
|
||||
@@ -305,14 +308,15 @@ type FeedInfo struct {
|
||||
Title string
|
||||
Description string
|
||||
SiteURL string
|
||||
SourceHost string
|
||||
}
|
||||
|
||||
// getFeedInfo returns feed metadata for profile setup
|
||||
func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
|
||||
var title, description, siteURL *string
|
||||
var title, description, siteURL, sourceHost *string
|
||||
err := c.db.QueryRow(`
|
||||
SELECT title, description, site_url FROM feeds WHERE url = $1
|
||||
`, feedURL).Scan(&title, &description, &siteURL)
|
||||
SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
|
||||
`, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
@@ -320,13 +324,14 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
|
||||
Title: StringValue(title),
|
||||
Description: StringValue(description),
|
||||
SiteURL: StringValue(siteURL),
|
||||
SourceHost: StringValue(sourceHost),
|
||||
}
|
||||
}
|
||||
|
||||
// RefreshAllProfiles updates profiles for all existing accounts with feed URLs
|
||||
func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, title, description, site_url, publish_account
|
||||
SELECT url, title, description, site_url, source_host, publish_account
|
||||
FROM feeds
|
||||
WHERE publish_account IS NOT NULL AND publish_account <> ''
|
||||
`)
|
||||
@@ -338,8 +343,8 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
|
||||
|
||||
for rows.Next() {
|
||||
var feedURL, account string
|
||||
var title, description, siteURL *string
|
||||
if err := rows.Scan(&feedURL, &title, &description, &siteURL, &account); err != nil {
|
||||
var title, description, siteURL, sourceHost *string
|
||||
if err := rows.Scan(&feedURL, &title, &description, &siteURL, &sourceHost, &account); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -355,13 +360,13 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
|
||||
if displayName == "" {
|
||||
displayName = account
|
||||
}
|
||||
desc := StringValue(description)
|
||||
desc := stripHTML(StringValue(description))
|
||||
if desc == "" {
|
||||
desc = "News feed via 1440.news"
|
||||
}
|
||||
// Add feed URL
|
||||
// Add feed URL as first line
|
||||
feedURLFull := "https://" + feedURL
|
||||
desc = desc + "\n\n" + feedURLFull
|
||||
desc = feedURLFull + "\n\n" + desc
|
||||
|
||||
// Truncate if needed
|
||||
if len(displayName) > 64 {
|
||||
@@ -373,8 +378,13 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
|
||||
|
||||
// Fetch and upload favicon as avatar
|
||||
var avatar *BlobRef
|
||||
if siteURL != nil && *siteURL != "" {
|
||||
faviconURL := publisher.FetchFavicon(*siteURL)
|
||||
faviconSource := StringValue(siteURL)
|
||||
if faviconSource == "" {
|
||||
// Fallback to source host
|
||||
faviconSource = StringValue(sourceHost)
|
||||
}
|
||||
if faviconSource != "" {
|
||||
faviconURL := publisher.FetchFavicon(faviconSource)
|
||||
if faviconURL != "" {
|
||||
avatar = publisher.fetchAndUploadImage(session, faviconURL)
|
||||
}
|
||||
@@ -392,7 +402,7 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string)
|
||||
func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT i.id, i.feed_url, i.guid, i.title, i.link, i.description, i.content,
|
||||
i.author, i.pub_date, i.discovered_at, i.image_urls,
|
||||
i.author, i.pub_date, i.discovered_at, i.image_urls, i.tags,
|
||||
i.enclosure_url, i.enclosure_type, i.enclosure_length
|
||||
FROM items i
|
||||
JOIN feeds f ON i.feed_url = f.url
|
||||
@@ -410,13 +420,13 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
|
||||
var items []Item
|
||||
for rows.Next() {
|
||||
var item Item
|
||||
var guid, title, link, description, content, author, imageURLsJSON *string
|
||||
var guid, title, link, description, content, author, imageURLsJSON, tagsJSON *string
|
||||
var pubDate, discoveredAt *time.Time
|
||||
var enclosureURL, enclosureType *string
|
||||
var enclosureLength *int64
|
||||
|
||||
err := rows.Scan(&item.ID, &item.FeedURL, &guid, &title, &link, &description,
|
||||
&content, &author, &pubDate, &discoveredAt, &imageURLsJSON,
|
||||
&content, &author, &pubDate, &discoveredAt, &imageURLsJSON, &tagsJSON,
|
||||
&enclosureURL, &enclosureType, &enclosureLength)
|
||||
if err != nil {
|
||||
continue
|
||||
@@ -436,6 +446,11 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
|
||||
json.Unmarshal([]byte(*imageURLsJSON), &item.ImageURLs)
|
||||
}
|
||||
|
||||
// Parse tags from JSON array
|
||||
if tagsJSON != nil && *tagsJSON != "" {
|
||||
json.Unmarshal([]byte(*tagsJSON), &item.Tags)
|
||||
}
|
||||
|
||||
// Parse enclosure
|
||||
if enclosureURL != nil && *enclosureURL != "" {
|
||||
item.Enclosure = &Enclosure{
|
||||
@@ -453,7 +468,87 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) {
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// StartCrawlLoop runs the domain crawling loop independently
|
||||
// StartDomainCheckLoop runs HEAD requests on approved domains to verify they're reachable
|
||||
func (c *Crawler) StartDomainCheckLoop() {
|
||||
numWorkers := runtime.NumCPU()
|
||||
if numWorkers < 1 {
|
||||
numWorkers = 1
|
||||
}
|
||||
|
||||
// Buffered channel for domain work
|
||||
workChan := make(chan *Domain, 256)
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < numWorkers; i++ {
|
||||
go func() {
|
||||
for domain := range workChan {
|
||||
// Do HEAD request to verify domain is reachable
|
||||
checkErr := c.checkDomain(domain.Host)
|
||||
errStr := ""
|
||||
if checkErr != nil {
|
||||
errStr = checkErr.Error()
|
||||
}
|
||||
if err := c.markDomainChecked(domain.Host, errStr); err != nil {
|
||||
fmt.Printf("Error marking domain %s as checked: %v\n", domain.Host, err)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
const fetchSize = 1000
|
||||
for {
|
||||
domains, err := c.GetDomainsToCheck(fetchSize)
|
||||
if err != nil {
|
||||
fmt.Printf("Error fetching domains to check: %v\n", err)
|
||||
}
|
||||
|
||||
if len(domains) == 0 {
|
||||
time.Sleep(1 * time.Second)
|
||||
continue
|
||||
}
|
||||
|
||||
fmt.Printf("%s domain-check: %d domains to verify\n", time.Now().Format("15:04:05"), len(domains))
|
||||
|
||||
for _, domain := range domains {
|
||||
workChan <- domain
|
||||
}
|
||||
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// checkDomain performs a HEAD request to verify a domain is reachable
|
||||
func (c *Crawler) checkDomain(host string) error {
|
||||
url := "https://" + host
|
||||
req, err := http.NewRequest("HEAD", url, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.UserAgent)
|
||||
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
// Try HTTP fallback
|
||||
url = "http://" + host
|
||||
req, err = http.NewRequest("HEAD", url, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("User-Agent", c.UserAgent)
|
||||
resp, err = c.client.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
return fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// StartCrawlLoop runs the domain crawling loop independently (crawls checked domains)
|
||||
func (c *Crawler) StartCrawlLoop() {
|
||||
numWorkers := runtime.NumCPU()
|
||||
if numWorkers < 1 {
|
||||
@@ -481,9 +576,9 @@ func (c *Crawler) StartCrawlLoop() {
|
||||
|
||||
const fetchSize = 1000
|
||||
for {
|
||||
domains, err := c.GetUncheckedDomains(fetchSize)
|
||||
domains, err := c.GetDomainsToCrawl(fetchSize)
|
||||
if err != nil {
|
||||
fmt.Printf("Error fetching domains: %v\n", err)
|
||||
fmt.Printf("Error fetching domains to crawl: %v\n", err)
|
||||
}
|
||||
|
||||
if len(domains) == 0 {
|
||||
@@ -492,7 +587,7 @@ func (c *Crawler) StartCrawlLoop() {
|
||||
continue
|
||||
}
|
||||
|
||||
fmt.Printf("%s crawl: %d domains to check\n", time.Now().Format("15:04:05"), len(domains))
|
||||
fmt.Printf("%s crawl: %d domains to crawl\n", time.Now().Format("15:04:05"), len(domains))
|
||||
|
||||
for _, domain := range domains {
|
||||
workChan <- domain
|
||||
|
||||
File diff suppressed because it is too large (+1040 −146)
@@ -15,8 +15,9 @@ import (
const schema = `
CREATE TABLE IF NOT EXISTS domains (
    host TEXT PRIMARY KEY,
    status TEXT NOT NULL DEFAULT 'unchecked',
    status TEXT NOT NULL DEFAULT 'hold',
    discovered_at TIMESTAMPTZ NOT NULL,
    last_checked_at TIMESTAMPTZ,
    last_crawled_at TIMESTAMPTZ,
    feeds_found INTEGER DEFAULT 0,
    last_error TEXT,
@@ -65,7 +66,7 @@ CREATE TABLE IF NOT EXISTS feeds (
    no_update INTEGER DEFAULT 0,

    -- Publishing to PDS
    publish_status TEXT DEFAULT 'held' CHECK(publish_status IN ('held', 'pass', 'deny')),
    publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')),
    publish_account TEXT,

    -- Full-text search vector
@@ -106,6 +107,7 @@ CREATE TABLE IF NOT EXISTS items (
    enclosure_type TEXT,
    enclosure_length BIGINT,
    image_urls TEXT, -- JSON array of image URLs
    tags TEXT, -- JSON array of category/tag strings

    -- Publishing to PDS
    published_at TIMESTAMPTZ,
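The new `tags` column is a JSON-encoded text blob, mirroring how `image_urls` is stored. A minimal sketch of writing it with pgx; the pool type and the exact query shape are assumptions for illustration.

```go
package main

import (
	"context"
	"encoding/json"

	"github.com/jackc/pgx/v5/pgxpool"
)

// saveTags stores a []string as JSON text in items.tags for one item.
func saveTags(ctx context.Context, db *pgxpool.Pool, feedURL, guid string, tags []string) error {
	data, err := json.Marshal(tags) // e.g. ["ai","machineLearning"]
	if err != nil {
		return err
	}
	_, err = db.Exec(ctx,
		`UPDATE items SET tags = $1 WHERE feed_url = $2 AND guid = $3`,
		string(data), feedURL, guid)
	return err
}
```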
@@ -25,14 +25,24 @@ services:
- "traefik.http.routers.app-1440-news.rule=Host(`app.1440.news`)"
- "traefik.http.routers.app-1440-news.entrypoints=https"
- "traefik.http.routers.app-1440-news.tls.certresolver=letsencrypt-dns"
# Production: HTTPS for 1440.news root (accounts directory) - lower priority than PDS API paths
- "traefik.http.routers.root-1440-news.rule=Host(`1440.news`)"
- "traefik.http.routers.root-1440-news.entrypoints=https"
- "traefik.http.routers.root-1440-news.tls.certresolver=letsencrypt-dns"
- "traefik.http.routers.root-1440-news.priority=10"
# Production: HTTPS for url.1440.news (URL shortener)
- "traefik.http.routers.url-1440-news.rule=Host(`url.1440.news`)"
- "traefik.http.routers.url-1440-news.entrypoints=https"
- "traefik.http.routers.url-1440-news.tls.certresolver=letsencrypt-dns"
# Production: HTTP to HTTPS redirect for both domains
# Production: HTTP to HTTPS redirect for app and url subdomains
- "traefik.http.routers.app-1440-news-redirect.rule=Host(`app.1440.news`) || Host(`url.1440.news`)"
- "traefik.http.routers.app-1440-news-redirect.entrypoints=http"
- "traefik.http.routers.app-1440-news-redirect.middlewares=https-redirect"
# Production: HTTP to HTTPS redirect for 1440.news root
- "traefik.http.routers.root-1440-news-redirect.rule=Host(`1440.news`)"
- "traefik.http.routers.root-1440-news-redirect.entrypoints=http"
- "traefik.http.routers.root-1440-news-redirect.middlewares=https-redirect"
- "traefik.http.routers.root-1440-news-redirect.priority=10"
- "traefik.http.middlewares.https-redirect.redirectscheme.scheme=https"
- "traefik.http.middlewares.https-redirect.redirectscheme.permanent=true"
# Local development: HTTP only
@@ -15,65 +15,72 @@ import (
|
||||
)
|
||||
|
||||
// Domain represents a host to be crawled for feeds
|
||||
// Status: hold (pending review), pass (approved), skip (not processing), fail (error)
|
||||
type Domain struct {
|
||||
Host string `json:"host"`
|
||||
Status string `json:"status"`
|
||||
DiscoveredAt time.Time `json:"discovered_at"`
|
||||
LastCheckedAt time.Time `json:"last_checked_at,omitempty"`
|
||||
LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
|
||||
FeedsFound int `json:"feeds_found,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
TLD string `json:"tld,omitempty"`
|
||||
}
|
||||
|
||||
// shouldAutoDenyDomain checks if a domain should be auto-denied based on patterns
|
||||
func shouldAutoDenyDomain(host string) bool {
|
||||
// Never deny our own domain
|
||||
// shouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns
|
||||
func shouldAutoSkipDomain(host string) bool {
|
||||
// Never skip our own domain
|
||||
if strings.HasSuffix(host, "1440.news") || host == "1440.news" {
|
||||
return false
|
||||
}
|
||||
// Deny domains starting with a digit (spam pattern)
|
||||
// Skip domains starting with a digit (spam pattern)
|
||||
if len(host) > 0 && host[0] >= '0' && host[0] <= '9' {
|
||||
return true
|
||||
}
|
||||
// Skip domains starting with letter-dash (spam pattern, e.g., "a-example.com")
|
||||
if len(host) > 1 && ((host[0] >= 'a' && host[0] <= 'z') || (host[0] >= 'A' && host[0] <= 'Z')) && host[1] == '-' {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// saveDomain stores a domain in PostgreSQL
|
||||
func (c *Crawler) saveDomain(domain *Domain) error {
|
||||
// Auto-deny domains matching spam patterns
|
||||
// Auto-skip domains matching spam patterns
|
||||
status := domain.Status
|
||||
if shouldAutoDenyDomain(domain.Host) {
|
||||
status = "denied"
|
||||
if shouldAutoSkipDomain(domain.Host) {
|
||||
status = "skip"
|
||||
}
|
||||
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT(host) DO UPDATE SET
|
||||
status = EXCLUDED.status,
|
||||
last_checked_at = EXCLUDED.last_checked_at,
|
||||
last_crawled_at = EXCLUDED.last_crawled_at,
|
||||
feeds_found = EXCLUDED.feeds_found,
|
||||
last_error = EXCLUDED.last_error,
|
||||
tld = EXCLUDED.tld
|
||||
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
||||
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt),
|
||||
NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||
return err
|
||||
}
|
||||
|
||||
// saveDomainTx stores a domain using a transaction
|
||||
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
|
||||
// Auto-deny domains matching spam patterns
|
||||
// Auto-skip domains matching spam patterns
|
||||
status := domain.Status
|
||||
if shouldAutoDenyDomain(domain.Host) {
|
||||
status = "denied"
|
||||
if shouldAutoSkipDomain(domain.Host) {
|
||||
status = "skip"
|
||||
}
|
||||
|
||||
_, err := tx.Exec(context.Background(), `
|
||||
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
INSERT INTO domains (host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT(host) DO NOTHING
|
||||
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
||||
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||
`, domain.Host, status, domain.DiscoveredAt, NullableTime(domain.LastCheckedAt),
|
||||
NullableTime(domain.LastCrawledAt), domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -87,14 +94,14 @@ func (c *Crawler) domainExists(host string) bool {
|
||||
// getDomain retrieves a domain from PostgreSQL
|
||||
func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||
domain := &Domain{}
|
||||
var lastCrawledAt *time.Time
|
||||
var lastCheckedAt, lastCrawledAt *time.Time
|
||||
var lastError *string
|
||||
|
||||
err := c.db.QueryRow(`
|
||||
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
|
||||
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE host = $1
|
||||
`, normalizeHost(host)).Scan(
|
||||
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
|
||||
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt,
|
||||
&domain.FeedsFound, &lastError, &domain.TLD,
|
||||
)
|
||||
|
||||
@@ -105,17 +112,34 @@ func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
domain.LastCheckedAt = TimeValue(lastCheckedAt)
|
||||
domain.LastCrawledAt = TimeValue(lastCrawledAt)
|
||||
domain.LastError = StringValue(lastError)
|
||||
|
||||
return domain, nil
|
||||
}
|
||||
|
||||
// GetUncheckedDomains returns up to limit unchecked domains ordered by discovered_at (FIFO)
|
||||
func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
|
||||
// GetDomainsToCheck returns domains ready for checking (status='pass', never checked)
|
||||
func (c *Crawler) GetDomainsToCheck(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE status = 'unchecked'
|
||||
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE status = 'pass' AND last_checked_at IS NULL
|
||||
ORDER BY discovered_at ASC
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return c.scanDomains(rows)
|
||||
}
|
||||
|
||||
// GetDomainsToCrawl returns domains ready for crawling (status='pass', checked but not crawled)
|
||||
func (c *Crawler) GetDomainsToCrawl(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discovered_at, last_checked_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE status = 'pass' AND last_checked_at IS NOT NULL AND last_crawled_at IS NULL
|
||||
ORDER BY discovered_at ASC
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
@@ -132,16 +156,17 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
|
||||
var domains []*Domain
|
||||
for rows.Next() {
|
||||
domain := &Domain{}
|
||||
var lastCrawledAt *time.Time
|
||||
var lastCheckedAt, lastCrawledAt *time.Time
|
||||
var lastError *string
|
||||
|
||||
if err := rows.Scan(
|
||||
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
|
||||
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCheckedAt, &lastCrawledAt,
|
||||
&domain.FeedsFound, &lastError, &domain.TLD,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
domain.LastCheckedAt = TimeValue(lastCheckedAt)
|
||||
domain.LastCrawledAt = TimeValue(lastCrawledAt)
|
||||
domain.LastError = StringValue(lastError)
|
||||
|
||||
@@ -151,36 +176,48 @@ func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
|
||||
return domains, rows.Err()
|
||||
}
|
||||
|
||||
// markDomainCrawled updates a domain's status after crawling
|
||||
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
|
||||
status := "checked"
|
||||
// markDomainChecked updates a domain after the check (HEAD request) stage
|
||||
func (c *Crawler) markDomainChecked(host string, lastError string) error {
|
||||
now := time.Now()
|
||||
if lastError != "" {
|
||||
status = "error"
|
||||
}
|
||||
|
||||
var err error
|
||||
if lastError != "" {
|
||||
_, err = c.db.Exec(`
|
||||
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = $4
|
||||
WHERE host = $5
|
||||
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
|
||||
} else {
|
||||
_, err = c.db.Exec(`
|
||||
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = NULL
|
||||
WHERE host = $4
|
||||
`, status, time.Now(), feedsFound, normalizeHost(host))
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET status = 'fail', last_checked_at = $1, last_error = $2
|
||||
WHERE host = $3
|
||||
`, now, lastError, normalizeHost(host))
|
||||
return err
|
||||
}
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET last_checked_at = $1, last_error = NULL
|
||||
WHERE host = $2
|
||||
`, now, normalizeHost(host))
|
||||
return err
|
||||
}
|
||||
|
||||
// GetDomainCount returns the total number of domains in the database
|
||||
func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
|
||||
// markDomainCrawled updates a domain after the crawl stage
|
||||
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
|
||||
now := time.Now()
|
||||
if lastError != "" {
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET status = 'fail', last_crawled_at = $1, feeds_found = $2, last_error = $3
|
||||
WHERE host = $4
|
||||
`, now, feedsFound, lastError, normalizeHost(host))
|
||||
return err
|
||||
}
|
||||
_, err := c.db.Exec(`
|
||||
UPDATE domains SET last_crawled_at = $1, feeds_found = $2, last_error = NULL
|
||||
WHERE host = $3
|
||||
`, now, feedsFound, normalizeHost(host))
|
||||
return err
|
||||
}
|
||||
|
||||
// GetDomainCount returns the total number of domains and counts by status
|
||||
func (c *Crawler) GetDomainCount() (total int, hold int, err error) {
|
||||
err = c.db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&total)
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'unchecked'").Scan(&unchecked)
|
||||
return total, unchecked, err
|
||||
err = c.db.QueryRow("SELECT COUNT(*) FROM domains WHERE status = 'hold'").Scan(&hold)
|
||||
return total, hold, err
|
||||
}
|
||||
|
||||
// ImportTestDomains adds a list of specific domains for testing
|
||||
@@ -189,7 +226,7 @@ func (c *Crawler) ImportTestDomains(domains []string) {
|
||||
for _, host := range domains {
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, tld)
|
||||
VALUES ($1, 'unchecked', $2, $3)
|
||||
VALUES ($1, 'hold', $2, $3)
|
||||
ON CONFLICT(host) DO NOTHING
|
||||
`, host, now, getTLD(host))
|
||||
if err != nil {
|
||||
@@ -200,7 +237,7 @@ func (c *Crawler) ImportTestDomains(domains []string) {
|
||||
}
|
||||
}
|
||||
|
||||
// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
|
||||
// ImportDomainsFromFile reads a vertices file and stores new domains as "hold"
|
||||
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
@@ -246,7 +283,7 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||
buf := make([]byte, 0, 64*1024)
|
||||
scanner.Buffer(buf, 1024*1024)
|
||||
|
||||
const batchSize = 1000
|
||||
const batchSize = 100000
|
||||
now := time.Now()
|
||||
totalImported := 0
|
||||
batchCount := 0
|
||||
@@ -284,12 +321,12 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||
break
|
||||
}
|
||||
|
||||
// Build rows for copy, applying auto-deny for spam patterns
|
||||
// Build rows for copy, applying auto-skip for spam patterns
|
||||
rows := make([][]interface{}, len(domains))
|
||||
for i, d := range domains {
|
||||
status := "unchecked"
|
||||
if shouldAutoDenyDomain(d.host) {
|
||||
status = "denied"
|
||||
status := "hold"
|
||||
if shouldAutoSkipDomain(d.host) {
|
||||
status = "skip"
|
||||
}
|
||||
rows[i] = []interface{}{d.host, status, now, d.tld}
|
||||
}
|
||||
@@ -306,9 +343,9 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||
if err != nil {
|
||||
// Fall back to individual inserts with ON CONFLICT
|
||||
for _, d := range domains {
|
||||
status := "unchecked"
|
||||
if shouldAutoDenyDomain(d.host) {
|
||||
status = "denied"
|
||||
status := "hold"
|
||||
if shouldAutoSkipDomain(d.host) {
|
||||
status = "skip"
|
||||
}
|
||||
c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, tld)
|
||||
@@ -361,7 +398,7 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
||||
|
||||
now := time.Now()
|
||||
count := 0
|
||||
const batchSize = 1000
|
||||
const batchSize = 100000
|
||||
|
||||
type domainEntry struct {
|
||||
host string
|
||||
@@ -391,11 +428,11 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
||||
break
|
||||
}
|
||||
|
||||
// Insert with ON CONFLICT, applying auto-deny for spam patterns
|
||||
// Insert with ON CONFLICT, applying auto-skip for spam patterns
|
||||
for _, d := range domains {
|
||||
status := "unchecked"
|
||||
if shouldAutoDenyDomain(d.host) {
|
||||
status = "denied"
|
||||
status := "hold"
|
||||
if shouldAutoSkipDomain(d.host) {
|
||||
status = "skip"
|
||||
}
|
||||
result, err := c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, tld)
|
||||
|
||||
@@ -119,6 +119,7 @@ type Item struct {
|
||||
// Media attachments
|
||||
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
|
||||
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
|
||||
Tags []string `json:"tags,omitempty"` // Category/tag strings from feed
|
||||
|
||||
// Publishing to PDS
|
||||
PublishedAt time.Time `json:"published_at,omitempty"`
|
||||
@@ -171,25 +172,25 @@ type Feed struct {
|
||||
NoUpdate int `json:"no_update"` // Consecutive checks with no change
|
||||
|
||||
// Publishing to PDS
|
||||
PublishStatus string `json:"publish_status"` // "held", "pass", "deny"
|
||||
PublishStatus string `json:"publish_status"` // "hold", "pass", "skip"
|
||||
PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
|
||||
}
|
||||
|
||||
// saveFeed stores a feed in PostgreSQL
|
||||
func (c *Crawler) saveFeed(feed *Feed) error {
|
||||
// Default publishStatus to "held" if not set
|
||||
// Auto-deny feeds with no language or unsupported type
|
||||
// Default publishStatus to "hold" if not set
|
||||
// Auto-skip feeds with no language or non-English language
|
||||
// Auto-pass feeds from our own domain
|
||||
publishStatus := feed.PublishStatus
|
||||
if publishStatus == "" {
|
||||
if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
|
||||
publishStatus = "pass"
|
||||
} else if feed.Language == "" {
|
||||
publishStatus = "deny"
|
||||
} else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
|
||||
publishStatus = "skip"
|
||||
} else if feed.Type != "rss" && feed.Type != "atom" && feed.Type != "json" {
|
||||
publishStatus = "deny"
|
||||
publishStatus = "skip"
|
||||
} else {
|
||||
publishStatus = "held"
|
||||
publishStatus = "hold"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,7 +331,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
||||
if publishStatus != nil {
|
||||
feed.PublishStatus = *publishStatus
|
||||
} else {
|
||||
feed.PublishStatus = "held"
|
||||
feed.PublishStatus = "hold"
|
||||
}
|
||||
feed.PublishAccount = StringValue(publishAccount)
|
||||
|
||||
@@ -526,7 +527,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
|
||||
if publishStatus != nil {
|
||||
feed.PublishStatus = *publishStatus
|
||||
} else {
|
||||
feed.PublishStatus = "held"
|
||||
feed.PublishStatus = "hold"
|
||||
}
|
||||
feed.PublishAccount = StringValue(publishAccount)
|
||||
|
||||
@@ -558,10 +559,19 @@ func (c *Crawler) saveItem(item *Item) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize tags as JSON
|
||||
var tagsJSON *string
|
||||
if len(item.Tags) > 0 {
|
||||
if data, err := json.Marshal(item.Tags); err == nil {
|
||||
s := string(data)
|
||||
tagsJSON = &s
|
||||
}
|
||||
}
|
||||
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
|
||||
ON CONFLICT(feed_url, guid) DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
link = EXCLUDED.link,
|
||||
@@ -573,12 +583,13 @@ func (c *Crawler) saveItem(item *Item) error {
|
||||
enclosure_url = EXCLUDED.enclosure_url,
|
||||
enclosure_type = EXCLUDED.enclosure_type,
|
||||
enclosure_length = EXCLUDED.enclosure_length,
|
||||
image_urls = EXCLUDED.image_urls
|
||||
image_urls = EXCLUDED.image_urls,
|
||||
tags = EXCLUDED.tags
|
||||
`,
|
||||
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
|
||||
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
|
||||
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
|
||||
)
|
||||
return err
|
||||
}
|
||||
@@ -620,10 +631,19 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize tags as JSON
|
||||
var tagsJSON *string
|
||||
if len(item.Tags) > 0 {
|
||||
if data, err := json.Marshal(item.Tags); err == nil {
|
||||
s := string(data)
|
||||
tagsJSON = &s
|
||||
}
|
||||
}
|
||||
|
||||
_, err := tx.Exec(context.Background(), `
|
||||
INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
|
||||
ON CONFLICT(feed_url, guid) DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
link = EXCLUDED.link,
|
||||
@@ -635,12 +655,13 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
enclosure_url = EXCLUDED.enclosure_url,
|
||||
enclosure_type = EXCLUDED.enclosure_type,
|
||||
enclosure_length = EXCLUDED.enclosure_length,
|
||||
image_urls = EXCLUDED.image_urls
|
||||
image_urls = EXCLUDED.image_urls,
|
||||
tags = EXCLUDED.tags
|
||||
`,
|
||||
item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
|
||||
NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
|
||||
NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
|
||||
enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
|
||||
)
|
||||
if err != nil {
|
||||
continue // Skip failed items
|
||||
@@ -654,7 +675,7 @@ func (c *Crawler) saveItems(items []*Item) error {
|
||||
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE feed_url = $1
|
||||
@@ -674,7 +695,7 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
|
||||
tsquery := ToSearchQuery(query)
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE search_vector @@ to_tsquery('english', $1)
|
||||
@@ -698,14 +719,14 @@ func scanItems(rows pgx.Rows) ([]*Item, error) {
|
||||
var pubDate, updatedAt, publishedAt *time.Time
|
||||
var enclosureUrl, enclosureType *string
|
||||
var enclosureLength *int64
|
||||
var imageUrlsJSON *string
|
||||
var imageUrlsJSON, tagsJSON *string
|
||||
var publishedUri *string
|
||||
|
||||
if err := rows.Scan(
|
||||
&item.ID, &item.FeedURL, &guid, &title, &link,
|
||||
&description, &content, &author, &pubDate,
|
||||
&item.DiscoveredAt, &updatedAt,
|
||||
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
|
||||
&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
|
||||
&publishedAt, &publishedUri,
|
||||
); err != nil {
|
||||
continue
|
||||
@@ -739,6 +760,14 @@ func scanItems(rows pgx.Rows) ([]*Item, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// Parse tags JSON
|
||||
if tagsJSON != nil && *tagsJSON != "" {
|
||||
var tags []string
|
||||
if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
|
||||
item.Tags = tags
|
||||
}
|
||||
}
|
||||
|
||||
item.PublishedAt = TimeValue(publishedAt)
|
||||
item.PublishedUri = StringValue(publishedUri)
|
||||
|
||||
@@ -907,6 +936,11 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
}
|
||||
@@ -939,6 +973,11 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
} else {
|
||||
feed.Status = "error"
|
||||
}
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
}
|
||||
@@ -952,6 +991,11 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
}
|
||||
@@ -992,7 +1036,7 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'deny')
|
||||
// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip')
|
||||
// If status is 'pass', the account handle is also set (auto-derived if empty)
|
||||
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
|
||||
feedURL = normalizeURL(feedURL)
|
||||
@@ -1031,7 +1075,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
|
||||
return scanFeeds(rows)
|
||||
}
|
||||
|
||||
// GetPublishCandidates returns feeds that are held for review and have items
|
||||
// GetPublishCandidates returns feeds that are on hold for review and have items
|
||||
func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, type, category, title, description, language, site_url,
|
||||
@@ -1044,7 +1088,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
|
||||
no_update,
|
||||
publish_status, publish_account
|
||||
FROM feeds
|
||||
WHERE publish_status = 'held' AND item_count > 0 AND status = 'active'
|
||||
WHERE publish_status = 'hold' AND item_count > 0 AND status = 'active'
|
||||
ORDER BY item_count DESC
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
@@ -1060,7 +1104,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
|
||||
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls,
|
||||
enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
|
||||
published_at, published_uri
|
||||
FROM items
|
||||
WHERE feed_url = $1 AND published_at IS NULL
|
||||
|
||||
@@ -56,7 +56,10 @@ func main() {
	// Publish loop (background) - autopublishes items for approved feeds
	go crawler.StartPublishLoop()

	// Crawl loop (background)
	// Domain check loop (background) - verifies approved domains are reachable
	go crawler.StartDomainCheckLoop()

	// Crawl loop (background) - crawls checked domains for feeds
	go crawler.StartCrawlLoop()

	// Wait for shutdown signal
@@ -41,6 +41,7 @@ type RSSItem struct {
|
||||
Author string `xml:"author"`
|
||||
Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
Categories []string `xml:"category"`
|
||||
Enclosure *RSSEnclosure `xml:"enclosure"`
|
||||
// iTunes item elements
|
||||
ITunesDuration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
|
||||
@@ -82,14 +83,20 @@ type AtomFeed struct {
|
||||
}
|
||||
|
||||
type AtomEntry struct {
|
||||
ID string `xml:"id"`
|
||||
Title string `xml:"title"`
|
||||
Links []AtomLink `xml:"link"`
|
||||
Summary string `xml:"summary"`
|
||||
Content AtomContent `xml:"content"`
|
||||
Author AtomAuthor `xml:"author"`
|
||||
Updated string `xml:"updated"`
|
||||
Published string `xml:"published"`
|
||||
ID string `xml:"id"`
|
||||
Title string `xml:"title"`
|
||||
Links []AtomLink `xml:"link"`
|
||||
Summary string `xml:"summary"`
|
||||
Content AtomContent `xml:"content"`
|
||||
Author AtomAuthor `xml:"author"`
|
||||
Updated string `xml:"updated"`
|
||||
Published string `xml:"published"`
|
||||
Categories []AtomCategory `xml:"category"`
|
||||
}
|
||||
|
||||
type AtomCategory struct {
|
||||
Term string `xml:"term,attr"`
|
||||
Label string `xml:"label,attr"`
|
||||
}
|
||||
|
||||
type AtomContent struct {
|
||||
@@ -222,6 +229,11 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
|
||||
// Extract images from various sources
|
||||
item.ImageURLs = extractItemImages(rssItem)
|
||||
|
||||
// Extract categories/tags
|
||||
if len(rssItem.Categories) > 0 {
|
||||
item.Tags = rssItem.Categories
|
||||
}
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
@@ -324,6 +336,20 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
|
||||
}
|
||||
}
|
||||
|
||||
// Extract categories/tags
|
||||
if len(entry.Categories) > 0 {
|
||||
for _, cat := range entry.Categories {
|
||||
// Prefer label, fall back to term
|
||||
tag := cat.Label
|
||||
if tag == "" {
|
||||
tag = cat.Term
|
||||
}
|
||||
if tag != "" {
|
||||
item.Tags = append(item.Tags, tag)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
items = append(items, item)
|
||||
}
|
||||
|
||||
@@ -514,16 +540,17 @@ type JSONFeed struct {
|
||||
}
|
||||
|
||||
type JSONFeedItem struct {
|
||||
ID string `json:"id"`
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title"`
|
||||
ContentHTML string `json:"content_html"`
|
||||
ContentText string `json:"content_text"`
|
||||
Summary string `json:"summary"`
|
||||
Image string `json:"image"`
|
||||
DatePublished string `json:"date_published"`
|
||||
DateModified string `json:"date_modified"`
|
||||
Authors []JSONFeedAuthor `json:"authors"`
|
||||
ID string `json:"id"`
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title"`
|
||||
ContentHTML string `json:"content_html"`
|
||||
ContentText string `json:"content_text"`
|
||||
Summary string `json:"summary"`
|
||||
Image string `json:"image"`
|
||||
DatePublished string `json:"date_published"`
|
||||
DateModified string `json:"date_modified"`
|
||||
Authors []JSONFeedAuthor `json:"authors"`
|
||||
Tags []string `json:"tags"`
|
||||
Attachments []JSONFeedAttachment `json:"attachments"`
|
||||
}
|
||||
|
||||
@@ -600,6 +627,11 @@ func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
|
||||
item.ImageURLs = []string{ji.Image}
|
||||
}
|
||||
|
||||
// Tags
|
||||
if len(ji.Tags) > 0 {
|
||||
item.Tags = ji.Tags
|
||||
}
|
||||
|
||||
// Attachments (enclosures)
|
||||
for _, att := range ji.Attachments {
|
||||
if att.URL != "" {
|
||||
|
||||
@@ -12,13 +12,14 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
_ "golang.org/x/image/webp"
|
||||
"go.deanishe.net/favicon"
|
||||
"golang.org/x/image/draw"
|
||||
_ "golang.org/x/image/webp"
|
||||
)
|
||||
|
||||
// Publisher handles posting items to AT Protocol PDS
|
||||
@@ -29,24 +30,24 @@ type Publisher struct {
|
||||
|
||||
// PDSSession holds authentication info for a PDS account
|
||||
type PDSSession struct {
|
||||
DID string `json:"did"`
|
||||
Handle string `json:"handle"`
|
||||
AccessJwt string `json:"accessJwt"`
|
||||
RefreshJwt string `json:"refreshJwt"`
|
||||
DID string `json:"did"`
|
||||
Handle string `json:"handle"`
|
||||
AccessJwt string `json:"accessJwt"`
|
||||
RefreshJwt string `json:"refreshJwt"`
|
||||
}
|
||||
|
||||
// BskyPost represents an app.bsky.feed.post record
|
||||
type BskyPost struct {
|
||||
Type string `json:"$type"`
|
||||
Text string `json:"text"`
|
||||
CreatedAt string `json:"createdAt"`
|
||||
Facets []BskyFacet `json:"facets,omitempty"`
|
||||
Embed *BskyEmbed `json:"embed,omitempty"`
|
||||
Type string `json:"$type"`
|
||||
Text string `json:"text"`
|
||||
CreatedAt string `json:"createdAt"`
|
||||
Facets []BskyFacet `json:"facets,omitempty"`
|
||||
Embed *BskyEmbed `json:"embed,omitempty"`
|
||||
}
|
||||
|
||||
type BskyFacet struct {
|
||||
Index BskyByteSlice `json:"index"`
|
||||
Features []BskyFeature `json:"features"`
|
||||
Index BskyByteSlice `json:"index"`
|
||||
Features []BskyFeature `json:"features"`
|
||||
}
|
||||
|
||||
type BskyByteSlice struct {
|
||||
@@ -57,12 +58,13 @@ type BskyByteSlice struct {
|
||||
type BskyFeature struct {
|
||||
Type string `json:"$type"`
|
||||
URI string `json:"uri,omitempty"`
|
||||
Tag string `json:"tag,omitempty"` // For hashtag facets
|
||||
}
|
||||
|
||||
type BskyEmbed struct {
|
||||
Type string `json:"$type"`
|
||||
External *BskyExternal `json:"external,omitempty"`
|
||||
Images []BskyImage `json:"images,omitempty"`
|
||||
Type string `json:"$type"`
|
||||
External *BskyExternal `json:"external,omitempty"`
|
||||
Images []BskyImage `json:"images,omitempty"`
|
||||
}
|
||||
|
||||
type BskyExternal struct {
|
||||
@@ -73,9 +75,9 @@ type BskyExternal struct {
|
||||
}
|
||||
|
||||
type BskyImage struct {
|
||||
Alt string `json:"alt"`
|
||||
Image *BlobRef `json:"image"`
|
||||
AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"`
|
||||
Alt string `json:"alt"`
|
||||
Image *BlobRef `json:"image"`
|
||||
AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"`
|
||||
}
|
||||
|
||||
type BskyAspectRatio struct {
|
||||
@@ -209,6 +211,66 @@ func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string
|
||||
return result.Code, nil
|
||||
}
|
||||
|
||||
// FollowAccount creates a follow record from the authenticated session to the target DID
|
||||
func (p *Publisher) FollowAccount(session *PDSSession, targetDID string) error {
|
||||
// Create follow record
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
record := map[string]interface{}{
|
||||
"$type": "app.bsky.graph.follow",
|
||||
"subject": targetDID,
|
||||
"createdAt": now,
|
||||
}
|
||||
|
||||
payload := map[string]interface{}{
|
||||
"repo": session.DID,
|
||||
"collection": "app.bsky.graph.follow",
|
||||
"record": record,
|
||||
}
|
||||
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
|
||||
|
||||
resp, err := p.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("follow failed: %s - %s", resp.Status, string(respBody))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// FollowAsDirectory logs in as the directory account and follows the target DID
|
||||
func (p *Publisher) FollowAsDirectory(targetDID string) error {
|
||||
dirHandle := os.Getenv("DIRECTORY_HANDLE")
|
||||
dirPassword := os.Getenv("DIRECTORY_PASSWORD")
|
||||
|
||||
if dirHandle == "" || dirPassword == "" {
|
||||
// Silently skip if directory account not configured
|
||||
return nil
|
||||
}
|
||||
|
||||
session, err := p.CreateSession(dirHandle, dirPassword)
|
||||
if err != nil {
|
||||
return fmt.Errorf("directory login failed: %w", err)
|
||||
}
|
||||
|
||||
return p.FollowAccount(session, targetDID)
|
||||
}
|
||||
|
||||
// TID alphabet for base32-sortable encoding
|
||||
const tidAlphabet = "234567abcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
@@ -268,6 +330,116 @@ func extractURLs(text string) []string {
|
||||
return urls
|
||||
}
|
||||
|
||||
// toCamelCaseTag converts a tag string to camelCase hashtag format
|
||||
// e.g., "Lagos News" -> "lagosNews", "AI" -> "ai", "machine learning" -> "machineLearning"
|
||||
func toCamelCaseTag(tag string) string {
|
||||
tag = strings.TrimSpace(tag)
|
||||
if tag == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Remove any # prefix if present
|
||||
tag = strings.TrimPrefix(tag, "#")
|
||||
|
||||
// Split on spaces and other separators
|
||||
words := strings.FieldsFunc(tag, func(r rune) bool {
|
||||
return r == ' ' || r == '-' || r == '_'
|
||||
})
|
||||
|
||||
if len(words) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// If single word, return lowercased
|
||||
if len(words) == 1 {
|
||||
return strings.ToLower(words[0])
|
||||
}
|
||||
|
||||
// Multiple words: lowercase first word, capitalize first letter of subsequent words
|
||||
var result strings.Builder
|
||||
for i, word := range words {
|
||||
if word == "" {
|
||||
continue
|
||||
}
|
||||
runes := []rune(word)
|
||||
if len(runes) > 0 {
|
||||
if i == 0 || result.Len() == 0 {
|
||||
// First word: all lowercase
|
||||
result.WriteString(strings.ToLower(word))
|
||||
} else {
|
||||
// Subsequent words: capitalize first letter, lowercase rest
|
||||
result.WriteString(strings.ToUpper(string(runes[0])))
|
||||
if len(runes) > 1 {
|
||||
result.WriteString(strings.ToLower(string(runes[1:])))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.String()
|
||||
}
|
||||
|
||||
// formatTagsForPost converts item tags to hashtag text and facets
|
||||
// Returns the hashtag line (e.g., "#AI #MachineLearning #News") and facets
|
||||
func formatTagsForPost(tags []string, textOffset int) (string, []BskyFacet) {
|
||||
if len(tags) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// Dedupe and convert tags
|
||||
seen := make(map[string]bool)
|
||||
var hashtags []string
|
||||
for _, tag := range tags {
|
||||
camel := toCamelCaseTag(tag)
|
||||
if camel == "" || seen[strings.ToLower(camel)] {
|
||||
continue
|
||||
}
|
||||
seen[strings.ToLower(camel)] = true
|
||||
hashtags = append(hashtags, camel)
|
||||
}
|
||||
|
||||
if len(hashtags) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// Limit to 5 tags to keep post compact
|
||||
if len(hashtags) > 5 {
|
||||
hashtags = hashtags[:5]
|
||||
}
|
||||
|
||||
// Build the hashtag line and facets
|
||||
var line strings.Builder
|
||||
var facets []BskyFacet
|
||||
currentOffset := textOffset
|
||||
|
||||
for i, ht := range hashtags {
|
||||
if i > 0 {
|
||||
line.WriteString(" ")
|
||||
currentOffset++
|
||||
}
|
||||
|
||||
hashtagText := "#" + ht
|
||||
byteStart := currentOffset
|
||||
byteEnd := currentOffset + len(hashtagText)
|
||||
|
||||
line.WriteString(hashtagText)
|
||||
|
||||
facets = append(facets, BskyFacet{
|
||||
Index: BskyByteSlice{
|
||||
ByteStart: byteStart,
|
||||
ByteEnd: byteEnd,
|
||||
},
|
||||
Features: []BskyFeature{{
|
||||
Type: "app.bsky.richtext.facet#tag",
|
||||
Tag: ht,
|
||||
}},
|
||||
})
|
||||
|
||||
currentOffset = byteEnd
|
||||
}
|
||||
|
||||
return line.String(), facets
|
||||
}
|
||||
|
||||
// PublishItem posts a feed item to the PDS
|
||||
// Returns the AT URI of the created record, or error
|
||||
func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
|
||||
@@ -316,81 +488,11 @@ func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error)
|
||||
}
|
||||
}
|
||||

	// Build post text: title + link labels
	// Bluesky has 300 grapheme limit - use rune count as approximation
	const maxGraphemes = 295 // Leave some margin

	// Create labeled links: "Article", "Audio", etc.
	type labeledLink struct {
		Label string
		URL   string
	}

	// Get the primary URL (article link)
	primaryURL := ""
	if len(allURLs) > 0 {
		primaryURL = allURLs[0]
	}

	var links []labeledLink

	for i, u := range allURLs {
		if i == 0 {
			// First URL is the article link
			links = append(links, labeledLink{Label: "Article", URL: u})
		} else if item.Enclosure != nil && u == item.Enclosure.URL {
			// Enclosure URL - label based on type
			encType := strings.ToLower(item.Enclosure.Type)
			if strings.HasPrefix(encType, "audio/") {
				links = append(links, labeledLink{Label: "Audio", URL: u})
			} else if strings.HasPrefix(encType, "video/") {
				links = append(links, labeledLink{Label: "Video", URL: u})
			} else {
				links = append(links, labeledLink{Label: "Media", URL: u})
			}
		} else if strings.Contains(u, "news.ycombinator.com") {
			links = append(links, labeledLink{Label: "Comments", URL: u})
		} else {
			links = append(links, labeledLink{Label: "Link", URL: u})
		}
	}

	// Calculate space needed for labels (in runes)
	// Format: "Article · Audio" or just "Article"
	labelSpace := 0
	for i, link := range links {
		labelSpace += utf8.RuneCountInString(link.Label)
		if i > 0 {
			labelSpace += 3 // " · " separator
		}
	}
	labelSpace += 2 // \n\n before labels

	// Truncate title if needed
	title := item.Title
	titleRunes := utf8.RuneCountInString(title)
	maxTitleRunes := maxGraphemes - labelSpace - 3 // -3 for "..."

	if titleRunes+labelSpace > maxGraphemes {
		if maxTitleRunes > 10 {
			runes := []rune(title)
			if len(runes) > maxTitleRunes {
				title = string(runes[:maxTitleRunes]) + "..."
			}
		} else {
			runes := []rune(title)
			if len(runes) > 50 {
				title = string(runes[:50]) + "..."
			}
		}
	}
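	// Worked example of the budget above (hypothetical numbers, not part of the
	// diff): with links "Article" and "Audio", labelSpace = 7 + 3 + 5 + 2 = 17,
	// so maxTitleRunes = 295 - 17 - 3 = 275; a 300-rune title is cut to 275
	// runes plus "...".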

	// Build final text with labels
	var textBuilder strings.Builder
	textBuilder.WriteString(title)
	if len(links) > 0 {
		textBuilder.WriteString("\n\n")
		for i, link := range links {
			if i > 0 {
				textBuilder.WriteString(" · ")
			}
			textBuilder.WriteString(link.Label)
		}
	}
	text := textBuilder.String()

	// Use original publication date if available, otherwise current time
	createdAt := time.Now()

@@ -398,60 +500,34 @@ func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error)
		createdAt = item.PubDate
	}

	// Build post text with hashtags if available
	// The link card shows the title, description, and thumbnail
	// Clicking the card doesn't trigger the "leaving Bluesky" warning
	postText := ""
	var facets []BskyFacet

	if len(item.Tags) > 0 {
		tagLine, tagFacets := formatTagsForPost(item.Tags, 0)
		postText = tagLine
		facets = tagFacets
	}

	post := BskyPost{
		Type:      "app.bsky.feed.post",
		Text:      text,
		Text:      postText,
		CreatedAt: createdAt.Format(time.RFC3339),
		Facets:    facets,
	}
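	// Sketch of the record this produces (hypothetical values; assumes BskyPost
	// marshals to the app.bsky.feed.post lexicon):
	//   {"$type":"app.bsky.feed.post","text":"#ai #machineLearning",
	//    "createdAt":"2025-01-01T12:00:00Z",
	//    "facets":[{"index":{"byteStart":0,"byteEnd":3},
	//               "features":[{"$type":"app.bsky.richtext.facet#tag","tag":"ai"}]}, ...]}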

	// Add facets for labeled links
	// Find each label in the text and create a facet linking to its URL
	searchPos := len(title) + 2 // Start after title + \n\n
	for _, link := range links {
		labelStart := strings.Index(text[searchPos:], link.Label)
		if labelStart >= 0 {
			labelStart += searchPos
			byteStart := len(text[:labelStart])
			byteEnd := byteStart + len(link.Label)

			post.Facets = append(post.Facets, BskyFacet{
				Index: BskyByteSlice{
					ByteStart: byteStart,
					ByteEnd:   byteEnd,
				},
				Features: []BskyFeature{
					{
						Type: "app.bsky.richtext.facet#link",
						URI:  link.URL,
					},
				},
			})
			searchPos = labelStart + len(link.Label)
		}
	}

	// Decide embed type based on content
	// Priority: images > external link card
	if len(item.ImageURLs) > 0 {
		// Try to upload images (up to 4)
		uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title)
		if len(uploadedImages) > 0 {
			post.Embed = &BskyEmbed{
				Type:   "app.bsky.embed.images",
				Images: uploadedImages,
			}
		}
	}
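	// For reference, the two embed shapes used here (hypothetical values, per the
	// app.bsky.embed.* lexicons):
	//   images:   {"$type":"app.bsky.embed.images","images":[{"image":<blob>,"alt":"..."}]}
	//   external: {"$type":"app.bsky.embed.external","external":{"uri":"...","title":"...","description":"...","thumb":<blob>}}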

	// Fall back to external embed if no images were uploaded
	if post.Embed == nil && len(allURLs) > 0 {
		// Always use external embed (link card) - clicking the card doesn't show "leaving" warning
		if primaryURL != "" {
			external := &BskyExternal{
				URI:         allURLs[0],
				URI:         primaryURL,
				Title:       item.Title,
				Description: truncate(stripHTML(item.Description), 300),
			}

			// Try to add thumbnail from first image
			// Add thumbnail from first image if available
			if len(item.ImageURLs) > 0 {
				if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
					external.Thumb = thumb
@@ -548,15 +624,15 @@ func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altTex
	return images
}

// fetchAndUploadImage downloads an image and uploads it to the PDS
// FetchFavicon tries to get a favicon URL for a site
// Uses go.deanishe.net/favicon library which parses HTML, manifests, and checks common paths
// Returns the favicon URL or empty string if not found
func (p *Publisher) FetchFavicon(siteURL string) string {
	if siteURL == "" {
		return ""
	}

	// Parse the site URL to get the host
	// Ensure URL has scheme
	if !strings.Contains(siteURL, "://") {
		siteURL = "https://" + siteURL
	}

@@ -565,23 +641,81 @@ func (p *Publisher) FetchFavicon(siteURL string) string {
		return ""
	}

	// Try common favicon locations
	faviconURLs := []string{
		fmt.Sprintf("https://%s/favicon.ico", u.Host),
		fmt.Sprintf("https://%s/favicon.png", u.Host),
		fmt.Sprintf("https://%s/apple-touch-icon.png", u.Host),
	}
	// Create finder with custom HTTP client
	// Note: Don't use IgnoreNoSize as it filters out valid favicon.ico files that don't have size metadata
	finder := favicon.New(
		favicon.WithClient(p.httpClient),
	)

	for _, faviconURL := range faviconURLs {
		resp, err := p.httpClient.Head(faviconURL)
		if err != nil {
			continue
	// Find icons - library checks HTML <link> tags, manifests, OG images, common paths
	icons, err := finder.Find(siteURL)
	if err == nil && len(icons) > 0 {
		// Filter and score icons for avatar use
		// Prefer: square icons, reasonable size, PNG format, actual favicons over OG images
		var bestIcon string
		var bestScore int

		for _, icon := range icons {
			// Skip tiny icons (likely tracking pixels)
			if icon.Width > 0 && icon.Width < 32 {
				continue
			}

			// Skip Open Graph images (meant for link previews, usually wide banners)
			lowerURL := strings.ToLower(icon.URL)
			if strings.Contains(lowerURL, "og-image") || strings.Contains(lowerURL, "og_image") ||
				strings.Contains(lowerURL, "opengraph") || strings.Contains(lowerURL, "twitter") {
				continue
			}

			// Skip wide images (aspect ratio > 1.5 means it's a banner, not an icon)
			if icon.Width > 0 && icon.Height > 0 {
				ratio := float64(icon.Width) / float64(icon.Height)
				if ratio > 1.5 || ratio < 0.67 {
					continue
				}
			}

			// Score the icon
			score := 0

			// Prefer actual favicon paths
			if strings.Contains(lowerURL, "favicon") || strings.Contains(lowerURL, "icon") ||
				strings.Contains(lowerURL, "apple-touch") {
				score += 100
			}

			// Prefer PNG over other formats
			if icon.MimeType == "image/png" {
				score += 50
			} else if icon.MimeType == "image/x-icon" || strings.HasSuffix(lowerURL, ".ico") {
				score += 40
			} else if icon.MimeType == "image/jpeg" {
				score += 10 // JPEG less preferred for icons
			}

			// Prefer larger icons (but not too large)
			if icon.Width >= 64 && icon.Width <= 512 {
				score += 30
			} else if icon.Width > 0 {
				score += 10
			}

			if score > bestScore {
				bestScore = score
				bestIcon = icon.URL
			}
		}
		resp.Body.Close()
		if resp.StatusCode == http.StatusOK {
			contentType := resp.Header.Get("Content-Type")
			if strings.HasPrefix(contentType, "image/") || strings.HasSuffix(faviconURL, ".ico") {
				return faviconURL

		if bestIcon != "" {
			return bestIcon
		}

		// Fall back to first non-OG icon
		for _, icon := range icons {
			lowerURL := strings.ToLower(icon.URL)
			if !strings.Contains(lowerURL, "og-image") && !strings.Contains(lowerURL, "og_image") {
				return icon.URL
			}
		}
	}
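	// Minimal usage sketch (hypothetical call, not part of the diff): with the
	// scoring above, p.FetchFavicon("example.com") would typically return an
	// apple-touch-icon or favicon.png URL, and "" when nothing usable is found.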

@@ -763,8 +897,9 @@ func stripHTML(s string) string {
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
//   feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
//   news.ycombinator.com/rss                 → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
	const maxSubdomainLen = 18 // PDS limit for first segment

@@ -792,7 +927,7 @@ func DeriveHandleFromFeed(feedURL string) string {
	skipPathWords := map[string]bool{
		"rss": true, "feed": true, "feeds": true, "atom": true,
		"xml": true, "default": true, "index": true, "services": true,
		"nyt": true, "blog": true,
		"nyt": true,
	}
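	// Illustration of the skip list (restating the doc-comment example above):
	// for feeds.bbci.co.uk/news/technology/rss.xml the path words "rss" and
	// "xml" are dropped, so only "technology" contributes to the derived handle.
	// The diff appears to stop skipping "blog", so blog-based paths keep that word.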

	var pathParts []string

@@ -1274,8 +1409,8 @@ func (p *Publisher) DeleteAccount(adminPassword, did string) error {
	}

// FetchFavicon downloads a favicon/icon from a URL
// Uses go.deanishe.net/favicon library to find the best icon
func FetchFavicon(siteURL string) ([]byte, string, error) {
	// Try common favicon locations
	if !strings.HasPrefix(siteURL, "http") {
		siteURL = "https://" + siteURL
	}
@@ -1285,48 +1420,83 @@ func FetchFavicon(siteURL string) ([]byte, string, error) {
		return nil, "", err
	}

	baseURL := u.Scheme + "://" + u.Host

	// Try apple-touch-icon first (usually higher quality)
	iconURLs := []string{
		baseURL + "/apple-touch-icon.png",
		baseURL + "/apple-touch-icon-precomposed.png",
		baseURL + "/favicon.png",
		baseURL + "/favicon.ico",
	}

	client := &http.Client{Timeout: 10 * time.Second}

	for _, iconURL := range iconURLs {
		resp, err := client.Get(iconURL)
		if err != nil {
			continue
		}
		defer resp.Body.Close()
	// Use favicon library to find icons
	finder := favicon.New(
		favicon.WithClient(client),
		favicon.IgnoreNoSize,
	)

		if resp.StatusCode != http.StatusOK {
			continue
		}

		data, err := io.ReadAll(resp.Body)
		if err != nil {
			continue
		}

		// Determine mime type
		contentType := resp.Header.Get("Content-Type")
		if contentType == "" {
			if strings.HasSuffix(iconURL, ".png") {
				contentType = "image/png"
			} else if strings.HasSuffix(iconURL, ".ico") {
				contentType = "image/x-icon"
			} else {
				contentType = "image/png" // default
			}
		}

		return data, contentType, nil
	icons, err := finder.Find(siteURL)
	if err != nil || len(icons) == 0 {
		// Fallback to Google's favicon service
		googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
		return fetchIconBytes(client, googleURL)
	}

	return nil, "", fmt.Errorf("no favicon found for %s", siteURL)
	// Try icons in order (sorted by size, largest first)
	// Prefer PNG/JPEG over ICO
	var iconURLs []string
	for _, icon := range icons {
		if icon.Width > 0 && icon.Width < 32 {
			continue // Skip tiny icons
		}
		if icon.MimeType == "image/png" || icon.MimeType == "image/jpeg" {
			iconURLs = append([]string{icon.URL}, iconURLs...) // Prepend PNG/JPEG
		} else {
			iconURLs = append(iconURLs, icon.URL)
		}
	}

	// If no good icons, use all of them
	if len(iconURLs) == 0 {
		for _, icon := range icons {
			iconURLs = append(iconURLs, icon.URL)
		}
	}

	// Try to download each icon
	for _, iconURL := range iconURLs {
		data, mimeType, err := fetchIconBytes(client, iconURL)
		if err == nil && len(data) > 0 {
			return data, mimeType, nil
		}
	}

	// Final fallback to Google
	googleURL := fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host)
	return fetchIconBytes(client, googleURL)
}
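
For a host like example.com (hypothetical), the Google fallback built above expands to:

	// https://www.google.com/s2/favicons?domain=example.com&sz=128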

// fetchIconBytes downloads an icon and returns its bytes and mime type
func fetchIconBytes(client *http.Client, iconURL string) ([]byte, string, error) {
	resp, err := client.Get(iconURL)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", err
	}

	// Determine mime type
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		if strings.HasSuffix(iconURL, ".png") {
			contentType = "image/png"
		} else if strings.HasSuffix(iconURL, ".ico") {
			contentType = "image/x-icon"
		} else {
			contentType = "image/png"
		}
	}

	return data, contentType, nil
}
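
A minimal usage sketch tying the two helpers together (hypothetical site; uploadAvatar is not part of this diff):

	// Fetch avatar bytes for a new feed account; FetchFavicon itself falls back
	// to Google's favicon service when the library finds nothing usable.
	data, mime, err := FetchFavicon("news.ycombinator.com")
	if err == nil {
		uploadAvatar(data, mime) // hypothetical helper
	}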