Files
crawler/item.go
primal 70828bf05d Remove unused enclosure_length from items table
The enclosure length was never used when publishing to the PDS.
Added migration to drop the column.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 22:56:45 -05:00

303 lines
8.3 KiB
Go

package main
import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"github.com/jackc/pgx/v5"
)
// Enclosure represents a media attachment (audio, video, image)
// carried by a feed item — e.g. a podcast episode's audio file.
// It is stored flattened into the enclosure_url / enclosure_type columns.
type Enclosure struct {
	URL  string `json:"url"`  // location of the media file
	Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
}
// Item represents an individual entry/article from a feed.
// An item is identified by the composite key (link, feed_url); items
// without a Link are never persisted (see saveItem/saveItems).
type Item struct {
	FeedURL     string    `json:"feed_url"`              // URL of the feed this item came from (part of the primary key)
	Link        string    `json:"link"`                  // Primary key (with feed_url)
	Title       string    `json:"title,omitempty"`       // entry title, may be empty
	Description string    `json:"description,omitempty"` // entry body/summary, may contain HTML
	Author      string    `json:"author,omitempty"`
	PubDate     time.Time `json:"pub_date,omitempty"` // publication date from the feed; zero when absent

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from description
	Tags      []string   `json:"tags,omitempty"`       // Category/tag strings from feed

	// Publishing to PDS — set once the item has been pushed
	// (see MarkItemPublished / GetUnpublishedItems).
	PublishedAt  time.Time `json:"published_at,omitempty"`  // when the item was published; zero if unpublished
	PublishedUri string    `json:"published_uri,omitempty"` // record URI assigned at publish time
}
// saveItem stores an item in PostgreSQL (upsert by link + feed_url)
// saveItem stores an item in PostgreSQL (upsert keyed on link + feed_url).
// Items without a Link are silently ignored, since they can never be
// published. Optional fields are written as NULL when absent.
func (c *Crawler) saveItem(item *Item) error {
	if item.Link == "" {
		// No link means no identity; nothing to store.
		return nil
	}

	// jsonOrNil renders a string slice as a JSON text column value.
	// Empty slices — and the (unlikely) marshal failure — map to NULL,
	// keeping the write best-effort.
	jsonOrNil := func(vals []string) *string {
		if len(vals) == 0 {
			return nil
		}
		data, err := json.Marshal(vals)
		if err != nil {
			return nil
		}
		s := string(data)
		return &s
	}

	// Flatten the optional enclosure into its two nullable columns.
	var encURL, encType *string
	if enc := item.Enclosure; enc != nil {
		encURL = NullableString(enc.URL)
		encType = NullableString(enc.Type)
	}

	_, err := c.db.Exec(`
		INSERT INTO items (link, feed_url, title, description, author, pub_date,
			enclosure_url, enclosure_type, image_urls, tags)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
		ON CONFLICT(link, feed_url) DO UPDATE SET
			title = EXCLUDED.title,
			description = EXCLUDED.description,
			author = EXCLUDED.author,
			pub_date = EXCLUDED.pub_date,
			enclosure_url = EXCLUDED.enclosure_url,
			enclosure_type = EXCLUDED.enclosure_type,
			image_urls = EXCLUDED.image_urls,
			tags = EXCLUDED.tags
	`,
		item.Link, item.FeedURL, NullableString(item.Title),
		NullableString(item.Description), NullableString(item.Author),
		NullableTime(item.PubDate),
		encURL, encType, jsonOrNil(item.ImageURLs), jsonOrNil(item.Tags),
	)
	return err
}
// saveItems stores multiple items efficiently
// saveItems stores multiple items atomically in a single transaction.
//
// Nil items and items without a Link are skipped up front. If any insert
// fails, the transaction is rolled back and a wrapped error is returned.
//
// Note: the previous behavior of `continue`-ing past a failed Exec was
// illusory — in PostgreSQL a failed statement puts the whole transaction
// into an aborted state, so every subsequent Exec and the final Commit
// would have failed anyway. Failing fast makes the outcome explicit.
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}
	ctx := context.Background()
	tx, err := c.db.Begin()
	if err != nil {
		return err
	}
	// Rollback is a no-op once Commit has succeeded.
	defer tx.Rollback(ctx)

	for _, item := range items {
		if item == nil || item.Link == "" {
			continue // no identity; cannot be stored or published
		}
		// Flatten the optional enclosure into its two nullable columns.
		var enclosureUrl, enclosureType *string
		if item.Enclosure != nil {
			enclosureUrl = NullableString(item.Enclosure.URL)
			enclosureType = NullableString(item.Enclosure.Type)
		}
		// Serialize image URLs as JSON (NULL when empty or on marshal failure).
		var imageUrlsJSON *string
		if len(item.ImageURLs) > 0 {
			if data, err := json.Marshal(item.ImageURLs); err == nil {
				s := string(data)
				imageUrlsJSON = &s
			}
		}
		// Serialize tags as JSON (NULL when empty or on marshal failure).
		var tagsJSON *string
		if len(item.Tags) > 0 {
			if data, err := json.Marshal(item.Tags); err == nil {
				s := string(data)
				tagsJSON = &s
			}
		}
		if _, err := tx.Exec(ctx, `
			INSERT INTO items (link, feed_url, title, description, author, pub_date,
				enclosure_url, enclosure_type, image_urls, tags)
			VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
			ON CONFLICT(link, feed_url) DO UPDATE SET
				title = EXCLUDED.title,
				description = EXCLUDED.description,
				author = EXCLUDED.author,
				pub_date = EXCLUDED.pub_date,
				enclosure_url = EXCLUDED.enclosure_url,
				enclosure_type = EXCLUDED.enclosure_type,
				image_urls = EXCLUDED.image_urls,
				tags = EXCLUDED.tags
		`,
			item.Link, item.FeedURL, NullableString(item.Title),
			NullableString(item.Description), NullableString(item.Author),
			NullableTime(item.PubDate),
			enclosureUrl, enclosureType, imageUrlsJSON, tagsJSON,
		); err != nil {
			return fmt.Errorf("upserting item %q in feed %q: %w", item.Link, item.FeedURL, err)
		}
	}
	return tx.Commit(ctx)
}
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT link, feed_url, title, description, author, pub_date,
enclosure_url, enclosure_type, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1
ORDER BY pub_date DESC
LIMIT $2
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
tsquery := ToSearchQuery(query)
rows, err := c.db.Query(`
SELECT link, feed_url, title, description, author, pub_date,
enclosure_url, enclosure_type, image_urls, tags,
published_at, published_uri
FROM items
WHERE search_vector @@ to_tsquery('english', $1)
ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
LIMIT $2
`, tsquery, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// scanItems is a helper to scan multiple item rows
func scanItems(rows pgx.Rows) ([]*Item, error) {
var items []*Item
for rows.Next() {
item := &Item{}
var title, description, author *string
var pubDate, publishedAt *time.Time
var enclosureUrl, enclosureType *string
var imageUrlsJSON, tagsJSON *string
var publishedUri *string
if err := rows.Scan(
&item.Link, &item.FeedURL, &title,
&description, &author, &pubDate,
&enclosureUrl, &enclosureType, &imageUrlsJSON, &tagsJSON,
&publishedAt, &publishedUri,
); err != nil {
continue
}
item.Title = StringValue(title)
item.Description = StringValue(description)
item.Author = StringValue(author)
item.PubDate = TimeValue(pubDate)
// Parse enclosure
if enclosureUrl != nil && *enclosureUrl != "" {
item.Enclosure = &Enclosure{
URL: *enclosureUrl,
Type: StringValue(enclosureType),
}
}
// Parse imageUrls JSON
if imageUrlsJSON != nil && *imageUrlsJSON != "" {
var urls []string
if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
item.ImageURLs = urls
}
}
// Parse tags JSON
if tagsJSON != nil && *tagsJSON != "" {
var tags []string
if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
item.Tags = tags
}
}
item.PublishedAt = TimeValue(publishedAt)
item.PublishedUri = StringValue(publishedUri)
items = append(items, item)
}
return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
result, err := c.db.Exec(`
DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
`, cutoff)
if err != nil {
return 0, err
}
return result, nil
}
// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT link, feed_url, title, description, author, pub_date,
enclosure_url, enclosure_type, image_urls, tags,
published_at, published_uri
FROM items
WHERE feed_url = $1 AND published_at IS NULL
ORDER BY pub_date ASC
LIMIT $2
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanItems(rows)
}
// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(feedURL, link, uri string) error {
_, err := c.db.Exec(`
UPDATE items SET published_at = NOW(), published_uri = $1 WHERE feed_url = $2 AND link = $3
`, uri, feedURL, link)
return err
}
// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
var count int
err := c.db.QueryRow(`
SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
`, feedURL).Scan(&count)
return count, err
}