Migrate to normalized FK schema (domain_host, domain_tld)

Replace source_host column with proper FK to domains table using
composite key (domain_host, domain_tld). This enables JOIN queries
instead of string concatenation for domain lookups.
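For example, a lookup that previously concatenated strings can now join on the
composite key. A hypothetical query, assuming the domains table shares the
(domain_host, domain_tld) composite key; not code from this commit:

    rows, err := db.Query(`
        SELECT f.url, d.domain_host, d.domain_tld
        FROM feeds f
        JOIN domains d
          ON d.domain_host = f.domain_host
         AND d.domain_tld = f.domain_tld
    `)
    // iterate rows as usual; sketch only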

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (as source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting the host part from a full domain (sketched below)
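
The commit does not show the helper's body; a minimal sketch, assuming the host
is simply everything before the last dot (real multi-label suffixes such as
co.uk would need a public-suffix lookup):

    // getDomainHost extracts the host portion of a "host.tld" domain.
    // Hypothetical sketch, not the committed implementation.
    func getDomainHost(domain string) string {
        if i := strings.LastIndex(domain, "."); i >= 0 {
            return domain[:i]
        }
        return domain
    }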

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: primal
Date:   2026-02-01 22:36:25 -05:00
Parent: e7f6be2203
Commit: 7ec4207173

12 changed files with 193 additions and 214 deletions
@@ -2,6 +2,7 @@ package main
 import (
 	"context"
+	"crypto/tls"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -42,16 +43,35 @@ func NewCrawler(connString string) (*Crawler, error) {
 		return nil, fmt.Errorf("failed to open database: %v", err)
 	}
+	// Custom transport with longer timeouts (HTTP/2 disabled for compatibility)
+	transport := &http.Transport{
+		TLSClientConfig: &tls.Config{
+			MinVersion: tls.VersionTLS12,
+			NextProtos: []string{"http/1.1"}, // Force HTTP/1.1 for compatibility
+		},
+		DialContext: (&net.Dialer{
+			Timeout:   30 * time.Second,
+			KeepAlive: 30 * time.Second,
+		}).DialContext,
+		ForceAttemptHTTP2:     false,
+		MaxIdleConns:          100,
+		IdleConnTimeout:       90 * time.Second,
+		TLSHandshakeTimeout:   30 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+		ResponseHeaderTimeout: 60 * time.Second,
+	}
 	return &Crawler{
 		MaxDepth:        10,
 		MaxPagesPerHost: 10,
-		Timeout:         10 * time.Second,
-		UserAgent:       "FeedCrawler/1.0",
+		Timeout:         60 * time.Second,
+		UserAgent:       "Mozilla/5.0 (compatible; FeedCrawler/1.0; +https://1440.news)",
 		startTime:       time.Now(),
 		db:              db,
 		shutdownCh:      make(chan struct{}),
 		client: &http.Client{
-			Timeout: 10 * time.Second,
+			Timeout:   60 * time.Second,
+			Transport: transport,
 			CheckRedirect: func(req *http.Request, via []*http.Request) error {
 				if len(via) >= 10 {
 					return fmt.Errorf("stopped after 10 redirects")
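
One way to sanity-check the forced-HTTP/1.1 setup; a hypothetical snippet, not
part of this commit, assuming a constructed *Crawler named crawler. resp.Proto
is the standard library's report of the negotiated protocol:

    // With NextProtos pinned to "http/1.1" and ForceAttemptHTTP2 false,
    // even HTTP/2-capable servers should negotiate HTTP/1.1.
    resp, err := crawler.client.Get("https://example.com/")
    if err == nil {
        fmt.Println(resp.Proto) // expected: "HTTP/1.1"
        resp.Body.Close()
    }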
@@ -347,7 +367,7 @@ type FeedInfo struct {
 func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
 	var title, description, siteURL, sourceHost *string
 	err := c.db.QueryRow(`
-		SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
+		SELECT title, description, site_url, domain_host || '.' || domain_tld as source_host FROM feeds WHERE url = $1
 	`, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
 	if err != nil {
 		return nil
@@ -363,7 +383,7 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
 // RefreshAllProfiles updates profiles for all existing accounts with feed URLs
 func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
 	rows, err := c.db.Query(`
-		SELECT url, title, description, site_url, source_host, publish_account
+		SELECT url, title, description, site_url, domain_host || '.' || domain_tld as source_host, publish_account
 		FROM feeds
 		WHERE publish_account IS NOT NULL AND publish_account <> ''
 	`)
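
Because the concatenated columns are aliased back to source_host, callers that
scan the result set stay unchanged. A hypothetical consumer loop (the commit
does not show this code), scanning the six selected columns in order:

    for rows.Next() {
        var url, account string
        var title, description, siteURL, sourceHost *string
        if err := rows.Scan(&url, &title, &description, &siteURL, &sourceHost, &account); err != nil {
            continue
        }
        // sourceHost still arrives as one "host.tld" string, now
        // reassembled in SQL from domain_host and domain_tld.
    }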