Migrate to normalized FK schema (domain_host, domain_tld)
Replace the source_host column with a proper foreign key to the domains table, using the composite key (domain_host, domain_tld). This enables JOIN queries instead of string concatenation for domain lookups.

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use the domain_host/domain_tld columns
- Add column aliases (as source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting the host from a domain

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
+25
-5
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -42,16 +43,35 @@ func NewCrawler(connString string) (*Crawler, error) {
|
||||
return nil, fmt.Errorf("failed to open database: %v", err)
|
||||
}
|
||||
|
||||
// Custom transport with longer timeouts (HTTP/2 disabled for compatibility)
|
||||
transport := &http.Transport{
|
||||
TLSClientConfig: &tls.Config{
|
||||
MinVersion: tls.VersionTLS12,
|
||||
NextProtos: []string{"http/1.1"}, // Force HTTP/1.1 for compatibility
|
||||
},
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 30 * time.Second,
|
||||
KeepAlive: 30 * time.Second,
|
||||
}).DialContext,
|
||||
ForceAttemptHTTP2: false,
|
||||
MaxIdleConns: 100,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
TLSHandshakeTimeout: 30 * time.Second,
|
||||
ExpectContinueTimeout: 1 * time.Second,
|
||||
ResponseHeaderTimeout: 60 * time.Second,
|
||||
}
|
||||
|
||||
return &Crawler{
|
||||
MaxDepth: 10,
|
||||
MaxPagesPerHost: 10,
|
||||
Timeout: 10 * time.Second,
|
||||
UserAgent: "FeedCrawler/1.0",
|
||||
Timeout: 60 * time.Second,
|
||||
UserAgent: "Mozilla/5.0 (compatible; FeedCrawler/1.0; +https://1440.news)",
|
||||
startTime: time.Now(),
|
||||
db: db,
|
||||
shutdownCh: make(chan struct{}),
|
||||
client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
Timeout: 60 * time.Second,
|
||||
Transport: transport,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("stopped after 10 redirects")
|
||||
@@ -347,7 +367,7 @@ type FeedInfo struct {
|
||||
func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
|
||||
var title, description, siteURL, sourceHost *string
|
||||
err := c.db.QueryRow(`
|
||||
SELECT title, description, site_url, source_host FROM feeds WHERE url = $1
|
||||
SELECT title, description, site_url, domain_host || '.' || domain_tld as source_host FROM feeds WHERE url = $1
|
||||
`, feedURL).Scan(&title, &description, &siteURL, &sourceHost)
|
||||
if err != nil {
|
||||
return nil
|
||||
@@ -363,7 +383,7 @@ func (c *Crawler) getFeedInfo(feedURL string) *FeedInfo {
|
||||
// RefreshAllProfiles updates profiles for all existing accounts with feed URLs
|
||||
func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT url, title, description, site_url, source_host, publish_account
|
||||
SELECT url, title, description, site_url, domain_host || '.' || domain_tld as source_host, publish_account
|
||||
FROM feeds
|
||||
WHERE publish_account IS NOT NULL AND publish_account <> ''
|
||||
`)
|
||||
|
||||
Reference in New Issue
Block a user