package main

import (
	"context"
	"encoding/json"
	"time"

	"github.com/jackc/pgx/v5"
)

// Enclosure represents a media attachment (audio, video, image).
type Enclosure struct {
	URL    string `json:"url"`
	Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64  `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed.
type Item struct {
	FeedURL      string    `json:"feed_url"`
	GUID         string    `json:"guid,omitempty"`
	Title        string    `json:"title,omitempty"`
	Link         string    `json:"link,omitempty"`
	Description  string    `json:"description,omitempty"`
	Content      string    `json:"content,omitempty"`
	Author       string    `json:"author,omitempty"`
	PubDate      time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt    time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content
	Tags      []string   `json:"tags,omitempty"`       // Category/tag strings from feed

	// Publishing to PDS
	PublishedAt  time.Time `json:"published_at,omitempty"`
	PublishedUri string    `json:"published_uri,omitempty"`
}

// upsertItemSQL inserts an item or, on (feed_url, guid) conflict, refreshes
// its mutable columns from the incoming row. It is shared by saveItem and
// saveItems so the two code paths cannot drift apart.
const upsertItemSQL = `
	INSERT INTO items (feed_url, guid, title, link, description, content, author,
		pub_date, discovered_at, updated_at,
		enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
	VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
	ON CONFLICT (feed_url, guid) DO UPDATE SET
		title = EXCLUDED.title,
		link = EXCLUDED.link,
		description = EXCLUDED.description,
		content = EXCLUDED.content,
		author = EXCLUDED.author,
		pub_date = EXCLUDED.pub_date,
		updated_at = EXCLUDED.updated_at,
		enclosure_url = EXCLUDED.enclosure_url,
		enclosure_type = EXCLUDED.enclosure_type,
		enclosure_length = EXCLUDED.enclosure_length,
		image_urls = EXCLUDED.image_urls,
		tags = EXCLUDED.tags
`

// upsertItemArgs builds the positional arguments for upsertItemSQL,
// serializing the enclosure fields and encoding image URLs and tags as JSON.
func upsertItemArgs(item *Item) []any {
	// Serialize enclosure fields
	var enclosureUrl, enclosureType *string
	var enclosureLength *int64
	if item.Enclosure != nil {
		enclosureUrl = NullableString(item.Enclosure.URL)
		enclosureType = NullableString(item.Enclosure.Type)
		if item.Enclosure.Length > 0 {
			enclosureLength = &item.Enclosure.Length
		}
	}

	// Serialize imageUrls as JSON
	var imageUrlsJSON *string
	if len(item.ImageURLs) > 0 {
		if data, err := json.Marshal(item.ImageURLs); err == nil {
			s := string(data)
			imageUrlsJSON = &s
		}
	}

	// Serialize tags as JSON
	var tagsJSON *string
	if len(item.Tags) > 0 {
		if data, err := json.Marshal(item.Tags); err == nil {
			s := string(data)
			tagsJSON = &s
		}
	}

	return []any{
		item.FeedURL, item.GUID,
		NullableString(item.Title), NullableString(item.Link),
		NullableString(item.Description), NullableString(item.Content),
		NullableString(item.Author),
		NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
		enclosureUrl, enclosureType, enclosureLength,
		imageUrlsJSON, tagsJSON,
	}
}

// saveItem stores an item in PostgreSQL (upsert by feed_url + guid).
func (c *Crawler) saveItem(item *Item) error {
	_, err := c.db.Exec(context.Background(), upsertItemSQL, upsertItemArgs(item)...)
	return err
}
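// NullableString, NullableTime, StringValue, and TimeValue are defined
// elsewhere in this package. For reference, they presumably behave along
// these lines (a sketch of the assumed contract, not the actual
// definitions): the Nullable* helpers map Go zero values to SQL NULL, and
// the *Value helpers map NULL back to zero values.
//
//	func NullableString(s string) *string {
//		if s == "" {
//			return nil // store NULL instead of an empty string
//		}
//		return &s
//	}
//
//	func NullableTime(t time.Time) *time.Time {
//		if t.IsZero() {
//			return nil // store NULL instead of the zero time
//		}
//		return &t
//	}
//
//	func StringValue(s *string) string {
//		if s == nil {
//			return ""
//		}
//		return *s
//	}
//
//	func TimeValue(t *time.Time) time.Time {
//		if t == nil {
//			return time.Time{}
//		}
//		return *t
//	}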
// saveItems stores multiple items in a single transaction.
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}

	ctx := context.Background()
	tx, err := c.db.Begin(ctx)
	if err != nil {
		return err
	}
	defer tx.Rollback(ctx) // no-op if the transaction committed

	for _, item := range items {
		if item == nil || item.GUID == "" {
			continue // Skip nil items or items without a GUID
		}
		// A failed statement aborts the whole PostgreSQL transaction, so we
		// cannot skip the item and keep going; abort the batch instead.
		if _, err := tx.Exec(ctx, upsertItemSQL, upsertItemArgs(item)...); err != nil {
			return err
		}
	}

	return tx.Commit(ctx)
}

// GetItemsByFeed returns the most recent items for a specific feed.
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(context.Background(), `
		SELECT feed_url, guid, title, link, description, content, author,
			pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1
		ORDER BY pub_date DESC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// SearchItems performs a full-text search on items, ranked by relevance and
// then by recency.
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
	tsquery := ToSearchQuery(query)

	rows, err := c.db.Query(context.Background(), `
		SELECT feed_url, guid, title, link, description, content, author,
			pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
		LIMIT $2
	`, tsquery, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}
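// ToSearchQuery is defined elsewhere in this package. Because its result is
// fed straight into to_tsquery above, it must emit valid tsquery syntax; a
// minimal sketch of that contract (an assumption, not the actual
// implementation) would strip operator characters from user input and AND
// the remaining terms together:
//
//	func ToSearchQuery(query string) string {
//		var terms []string
//		for _, t := range strings.Fields(query) {
//			// Drop tsquery operator characters so raw user input
//			// cannot produce invalid syntax.
//			t = strings.Map(func(r rune) rune {
//				if strings.ContainsRune("&|!():'", r) {
//					return -1
//				}
//				return r
//			}, t)
//			if t != "" {
//				terms = append(terms, t)
//			}
//		}
//		return strings.Join(terms, " & ")
//	}
//
// PostgreSQL's plainto_tsquery or websearch_to_tsquery would perform
// equivalent sanitizing server-side.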
// scanItems is a helper to scan multiple item rows.
func scanItems(rows pgx.Rows) ([]*Item, error) {
	var items []*Item
	for rows.Next() {
		item := &Item{}
		var guid, title, link, description, content, author *string
		var pubDate, updatedAt, publishedAt *time.Time
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		var imageUrlsJSON, tagsJSON *string
		var publishedUri *string

		if err := rows.Scan(
			&item.FeedURL, &guid, &title, &link, &description, &content, &author,
			&pubDate, &item.DiscoveredAt, &updatedAt,
			&enclosureUrl, &enclosureType, &enclosureLength,
			&imageUrlsJSON, &tagsJSON,
			&publishedAt, &publishedUri,
		); err != nil {
			// A scan error means the column list and destinations disagree,
			// which would affect every row; report it instead of skipping.
			return nil, err
		}

		item.GUID = StringValue(guid)
		item.Title = StringValue(title)
		item.Link = StringValue(link)
		item.Description = StringValue(description)
		item.Content = StringValue(content)
		item.Author = StringValue(author)
		item.PubDate = TimeValue(pubDate)
		item.UpdatedAt = TimeValue(updatedAt)

		// Parse enclosure
		if enclosureUrl != nil && *enclosureUrl != "" {
			item.Enclosure = &Enclosure{
				URL:  *enclosureUrl,
				Type: StringValue(enclosureType),
			}
			if enclosureLength != nil {
				item.Enclosure.Length = *enclosureLength
			}
		}

		// Parse imageUrls JSON
		if imageUrlsJSON != nil && *imageUrlsJSON != "" {
			var urls []string
			if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
				item.ImageURLs = urls
			}
		}

		// Parse tags JSON
		if tagsJSON != nil && *tagsJSON != "" {
			var tags []string
			if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
				item.Tags = tags
			}
		}

		item.PublishedAt = TimeValue(publishedAt)
		item.PublishedUri = StringValue(publishedUri)

		items = append(items, item)
	}
	return items, rows.Err()
}

// CleanupOldItems removes items older than 12 months and reports how many
// rows were deleted.
func (c *Crawler) CleanupOldItems() (int64, error) {
	cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago

	result, err := c.db.Exec(context.Background(), `
		DELETE FROM items
		WHERE pub_date < $1 AND pub_date IS NOT NULL
	`, cutoff)
	if err != nil {
		return 0, err
	}
	return result.RowsAffected(), nil
}

// GetUnpublishedItems returns items for a feed that have not been published
// yet, oldest first.
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(context.Background(), `
		SELECT feed_url, guid, title, link, description, content, author,
			pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1 AND published_at IS NULL
		ORDER BY pub_date ASC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// MarkItemPublished marks an item as published with the given URI.
func (c *Crawler) MarkItemPublished(feedURL, guid, uri string) error {
	_, err := c.db.Exec(context.Background(), `
		UPDATE items
		SET published_at = NOW(), published_uri = $1
		WHERE feed_url = $2 AND guid = $3
	`, uri, feedURL, guid)
	return err
}

// GetUnpublishedItemCount returns the number of unpublished items for a feed.
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
	var count int
	err := c.db.QueryRow(context.Background(), `
		SELECT COUNT(*) FROM items
		WHERE feed_url = $1 AND published_at IS NULL
	`, feedURL).Scan(&count)
	return count, err
}
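// The queries in this file assume an items table roughly like the sketch
// below. The actual migration lives outside this file, so the column types
// and index names here are assumptions; what the code does require is a
// unique constraint on (feed_url, guid) for the upserts, and a search_vector
// tsvector column kept current by the database (e.g. a generated column)
// with a GIN index for SearchItems.
//
//	CREATE TABLE items (
//		feed_url         TEXT NOT NULL,
//		guid             TEXT NOT NULL,
//		title            TEXT,
//		link             TEXT,
//		description      TEXT,
//		content          TEXT,
//		author           TEXT,
//		pub_date         TIMESTAMPTZ,
//		discovered_at    TIMESTAMPTZ NOT NULL,
//		updated_at       TIMESTAMPTZ,
//		enclosure_url    TEXT,
//		enclosure_type   TEXT,
//		enclosure_length BIGINT,
//		image_urls       TEXT, -- JSON array, see upsertItemArgs
//		tags             TEXT, -- JSON array, see upsertItemArgs
//		published_at     TIMESTAMPTZ,
//		published_uri    TEXT,
//		search_vector    TSVECTOR GENERATED ALWAYS AS (
//			to_tsvector('english', coalesce(title, '') || ' ' ||
//				coalesce(description, '') || ' ' || coalesce(content, ''))
//		) STORED,
//		UNIQUE (feed_url, guid)
//	);
//
//	CREATE INDEX items_search_idx ON items USING GIN (search_vector);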