Migrate to normalized FK schema (domain_host, domain_tld)

Replace the source_host and tld columns with a proper FK to the domains
table, using the composite key (domain_host, domain_tld). This enables
JOIN queries for domain lookups instead of string concatenation.
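
For illustration, a lookup in the style of this file's existing query helpers,
showing the JOIN the composite key makes possible. This is a sketch only: the
helper name and the domains table's column names (host, tld) are assumptions,
not part of this commit. The pre-migration equivalent would have needed string
concatenation along the lines of f.source_host = d.host || '.' || d.tld.

    // CountFeedsWithKnownDomain is a sketch, not a function added by this
    // commit; the domains column names (host, tld) are assumed.
    func (c *Crawler) CountFeedsWithKnownDomain() (int, error) {
        var count int
        err := c.db.QueryRow(`
            SELECT COUNT(*)
            FROM feeds f
            JOIN domains d ON d.host = f.domain_host AND d.tld = f.domain_tld
        `).Scan(&count)
        return count, err
    }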

Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (domain_host AS source_host) for API backwards compatibility; see the aliasing sketch after this list
- Update trigram index from source_host to domain_host (index DDL sketched below)
- Add getDomainHost() helper for extracting the host from a domain (sketched below)
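
The aliasing mentioned above works roughly as below; this is a sketch, not a
query this commit actually touches, and the domain_tld AS tld alias is an
assumption:

    // listFeedsLegacy is a sketch only: it shows the aliasing that keeps the
    // old column names visible to callers that read columns by name (e.g.
    // when rows are serialized for the API); positional Scan is unaffected.
    func (c *Crawler) listFeedsLegacy() (pgx.Rows, error) {
        return c.db.Query(`
            SELECT url, title, domain_host AS source_host, domain_tld AS tld
            FROM feeds
        `)
    }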
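
The index swap amounts to DDL along these lines; the index names are
assumptions (the migration itself is in a file not shown in this excerpt),
and gin_trgm_ops requires the pg_trgm extension:

    // migrateTrigramIndex is a sketch of the index change, not the actual
    // migration; the index names here are assumed.
    const migrateTrigramIndex = `
        DROP INDEX IF EXISTS feeds_source_host_trgm_idx;
        CREATE INDEX feeds_domain_host_trgm_idx
            ON feeds USING gin (domain_host gin_trgm_ops);
    `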
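
And a minimal sketch of a getDomainHost()-style helper, assuming the stored
domain splits on its last dot; the real helper added by this commit is in a
part of the diff not shown here:

    // getDomainHost returns the host part of a registrable domain, e.g.
    // "example" for "example.com". Sketch only; the actual helper may differ,
    // and multi-label suffixes such as "co.uk" need a public-suffix list.
    func getDomainHost(domain string) string {
        if i := strings.LastIndex(domain, "."); i > 0 {
            return domain[:i]
        }
        return domain
    }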

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: primal
Date:   2026-02-01 22:36:25 -05:00
Commit: 7ec4207173 (parent e7f6be2203)
12 changed files with 193 additions and 214 deletions (+22 -22 in the file shown below)
@@ -116,8 +116,8 @@ type Feed struct {
 // Discovery source
 SourceURL string `json:"source_url,omitempty"`
-SourceHost string `json:"source_host,omitempty"`
-TLD string `json:"tld,omitempty"`
+DomainHost string `json:"domain_host,omitempty"`
+DomainTLD string `json:"domain_tld,omitempty"`
 // Content stats
 ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check
@@ -139,7 +139,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 // Auto-pass feeds from our own domain
 publishStatus := feed.PublishStatus
 if publishStatus == "" {
-if strings.HasSuffix(feed.SourceHost, "1440.news") || feed.SourceHost == "1440.news" {
+if strings.HasSuffix(feed.DomainHost, "1440.news") || feed.DomainHost == "1440.news" {
 publishStatus = "pass"
 } else if feed.Language == "" || (feed.Language != "en" && !strings.HasPrefix(feed.Language, "en-")) {
 publishStatus = "skip"
@@ -156,7 +156,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -188,7 +188,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 feed.DiscoveredAt, NullableTime(feed.LastCheckedAt), NullableTime(feed.NextCheckAt), NullableTime(feed.LastBuildDate),
 NullableString(feed.ETag), NullableString(feed.LastModified),
 feed.Status, NullableString(feed.LastError), NullableTime(feed.LastErrorAt),
-NullableString(feed.SourceURL), NullableString(feed.SourceHost), NullableString(feed.TLD),
+NullableString(feed.SourceURL), NullableString(feed.DomainHost), NullableString(feed.DomainTLD),
 feed.ItemCount, NullableTime(feed.OldestItemDate), NullableTime(feed.NewestItemDate),
 feed.NoUpdate,
 publishStatus, NullableString(feed.PublishAccount),
@@ -201,7 +201,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 feed := &Feed{}
 var category, title, description, language, siteURL *string
 var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
-var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
+var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
 var publishStatus, publishAccount *string
 var itemCount, noUpdate *int
@@ -210,7 +210,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -220,7 +220,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &etag, &lastModified,
 &feed.Status, &lastError, &lastErrorAt,
-&sourceURL, &sourceHost, &tld,
+&sourceURL, &domainHost, &domainTLD,
 &itemCount, &oldestItemDate, &newestItemDate,
 &noUpdate,
 &publishStatus, &publishAccount,
@@ -251,8 +251,8 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
 feed.LastError = StringValue(lastError)
 feed.LastErrorAt = TimeValue(lastErrorAt)
 feed.SourceURL = StringValue(sourceURL)
-feed.SourceHost = StringValue(sourceHost)
-feed.TLD = StringValue(tld)
+feed.DomainHost = StringValue(domainHost)
+feed.DomainTLD = StringValue(domainTLD)
 if itemCount != nil {
 feed.ItemCount = *itemCount
 }
@@ -285,7 +285,7 @@ func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -309,7 +309,7 @@ func (c *Crawler) GetFeedCount() (int, error) {
 // GetFeedCountByHost returns the number of feeds for a specific host
 func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
 var count int
-err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE source_host = $1", host).Scan(&count)
+err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE domain_host = $1", host).Scan(&count)
 return count, err
 }
@@ -320,7 +320,7 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -344,11 +344,11 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
-FROM feeds WHERE source_host = $1
+FROM feeds WHERE domain_host = $1
 `, host)
 if err != nil {
 return nil, err
@@ -366,7 +366,7 @@ func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -390,7 +390,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 feed := &Feed{}
 var feedType, category, title, description, language, siteURL *string
 var lastCheckedAt, nextCheckAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate *time.Time
-var etag, lastModified, lastError, sourceURL, sourceHost, tld *string
+var etag, lastModified, lastError, sourceURL, domainHost, domainTLD *string
 var itemCount, noUpdate *int
 var status *string
 var publishStatus, publishAccount *string
@@ -400,7 +400,7 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 &feed.DiscoveredAt, &lastCheckedAt, &nextCheckAt, &lastBuildDate,
 &etag, &lastModified,
 &status, &lastError, &lastErrorAt,
-&sourceURL, &sourceHost, &tld,
+&sourceURL, &domainHost, &domainTLD,
 &itemCount, &oldestItemDate, &newestItemDate,
 &noUpdate,
 &publishStatus, &publishAccount,
@@ -428,8 +428,8 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
 feed.LastError = StringValue(lastError)
 feed.LastErrorAt = TimeValue(lastErrorAt)
 feed.SourceURL = StringValue(sourceURL)
-feed.SourceHost = StringValue(sourceHost)
-feed.TLD = StringValue(tld)
+feed.DomainHost = StringValue(domainHost)
+feed.DomainTLD = StringValue(domainTLD)
 if itemCount != nil {
 feed.ItemCount = *itemCount
 }
@@ -474,7 +474,7 @@ func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account
@@ -496,7 +496,7 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
 discovered_at, last_checked_at, next_check_at, last_build_date,
 etag, last_modified,
 status, last_error, last_error_at,
-source_url, source_host, tld,
+source_url, domain_host, domain_tld,
 item_count, oldest_item_date, newest_item_date,
 no_update,
 publish_status, publish_account