// crawler/item.go

package main

import (
	"context"
	"encoding/json"
	"time"

	"github.com/jackc/pgx/v5"
)

// Enclosure represents a media attachment (audio, video, image) carried by a
// feed item.
//
// It is persisted in the items table as the enclosure_url, enclosure_type and
// enclosure_length columns. When saving, empty strings go through
// NullableString and a Length that is not positive is written as NULL. When
// loading, an Enclosure is only rebuilt for rows whose enclosure_url is
// non-empty.
type Enclosure struct {
	URL    string `json:"url"`
	Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64  `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed.
//
// Rows in the items table are upserted by the (feed_url, guid) pair. The save
// paths in this file write every field except ID, PublishedAt and
// PublishedUri; the publishing columns are set by MarkItemPublished. All
// fields are populated when rows are read back through scanItems, with NULL
// columns mapped to the field's zero value.
//
// NOTE(review): the `omitempty` option on the time.Time fields has no effect
// with encoding/json, because a struct value is never considered empty. Zero
// timestamps are therefore still emitted in JSON output.
type Item struct {
	ID           int64     `json:"id,omitempty"`
	FeedURL      string    `json:"feed_url"`
	GUID         string    `json:"guid,omitempty"`
	Title        string    `json:"title,omitempty"`
	Link         string    `json:"link,omitempty"`
	Description  string    `json:"description,omitempty"`
	Content      string    `json:"content,omitempty"`
	Author       string    `json:"author,omitempty"`
	PubDate      time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt    time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content
	Tags      []string   `json:"tags,omitempty"`       // Category/tag strings from feed

	// Publishing to PDS
	PublishedAt  time.Time `json:"published_at,omitempty"`
	PublishedUri string    `json:"published_uri,omitempty"`
}

// saveItem writes a single item to PostgreSQL, inserting it or, when a row
// with the same (feed_url, guid) already exists, overwriting that row's
// content columns. discovered_at is only set on insert.
//
// Optional values are sent as NULL when absent: the enclosure columns when the
// item has no enclosure (and enclosure_length when it is not positive), and
// image_urls / tags when the corresponding slice is empty. The error from the
// database call is returned unchanged.
func (c *Crawler) saveItem(item *Item) error {
	// jsonOrNil renders a non-empty string list as a JSON array; an empty list
	// (or a list that cannot be marshaled) yields nil so the column stays NULL.
	jsonOrNil := func(values []string) *string {
		if len(values) == 0 {
			return nil
		}
		raw, err := json.Marshal(values)
		if err != nil {
			return nil
		}
		encoded := string(raw)
		return &encoded
	}

	// Flatten the optional enclosure into three nullable column values.
	var (
		encURL, encType *string
		encLen          *int64
	)
	if enc := item.Enclosure; enc != nil {
		encURL = NullableString(enc.URL)
		encType = NullableString(enc.Type)
		if enc.Length > 0 {
			encLen = &enc.Length
		}
	}

	images := jsonOrNil(item.ImageURLs)
	tags := jsonOrNil(item.Tags)

	const upsert = `
		INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
		ON CONFLICT(feed_url, guid) DO UPDATE SET
			title = EXCLUDED.title,
			link = EXCLUDED.link,
			description = EXCLUDED.description,
			content = EXCLUDED.content,
			author = EXCLUDED.author,
			pub_date = EXCLUDED.pub_date,
			updated_at = EXCLUDED.updated_at,
			enclosure_url = EXCLUDED.enclosure_url,
			enclosure_type = EXCLUDED.enclosure_type,
			enclosure_length = EXCLUDED.enclosure_length,
			image_urls = EXCLUDED.image_urls,
			tags = EXCLUDED.tags
	`

	_, err := c.db.Exec(upsert,
		item.FeedURL,
		item.GUID,
		NullableString(item.Title),
		NullableString(item.Link),
		NullableString(item.Description),
		NullableString(item.Content),
		NullableString(item.Author),
		NullableTime(item.PubDate),
		item.DiscoveredAt,
		NullableTime(item.UpdatedAt),
		encURL,
		encType,
		encLen,
		images,
		tags,
	)
	return err
}

// saveItems upserts a batch of items (keyed by feed_url + guid) inside a
// single transaction.
//
// Nil items and items without a GUID are skipped. Every remaining item is
// written under its own savepoint: PostgreSQL aborts the whole transaction
// after any failed statement, so without a savepoint one bad item would make
// every later upsert fail and turn the final commit into a rollback. With the
// savepoint, a failing item is undone on its own and the rest of the batch is
// still committed. This costs two extra statements per item.
//
// An error is returned when the transaction cannot be started, when a
// savepoint statement fails, or when the commit fails. A failure of an
// individual item's upsert is not reported; that item is skipped.
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}

	const upsertItem = `
		INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
		ON CONFLICT(feed_url, guid) DO UPDATE SET
			title = EXCLUDED.title,
			link = EXCLUDED.link,
			description = EXCLUDED.description,
			content = EXCLUDED.content,
			author = EXCLUDED.author,
			pub_date = EXCLUDED.pub_date,
			updated_at = EXCLUDED.updated_at,
			enclosure_url = EXCLUDED.enclosure_url,
			enclosure_type = EXCLUDED.enclosure_type,
			enclosure_length = EXCLUDED.enclosure_length,
			image_urls = EXCLUDED.image_urls,
			tags = EXCLUDED.tags
	`

	ctx := context.Background()

	tx, err := c.db.Begin()
	if err != nil {
		return err
	}
	// Safety net for the early returns below; its result is deliberately
	// ignored.
	defer tx.Rollback(ctx)

	for _, item := range items {
		if item == nil || item.GUID == "" {
			continue // Skip nil items or items without GUID
		}

		// Serialize enclosure fields
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		if item.Enclosure != nil {
			enclosureUrl = NullableString(item.Enclosure.URL)
			enclosureType = NullableString(item.Enclosure.Type)
			if item.Enclosure.Length > 0 {
				enclosureLength = &item.Enclosure.Length
			}
		}

		// Serialize imageUrls as JSON; the column stays NULL when there are none.
		var imageUrlsJSON *string
		if len(item.ImageURLs) > 0 {
			if data, err := json.Marshal(item.ImageURLs); err == nil {
				s := string(data)
				imageUrlsJSON = &s
			}
		}

		// Serialize tags as JSON; the column stays NULL when there are none.
		var tagsJSON *string
		if len(item.Tags) > 0 {
			if data, err := json.Marshal(item.Tags); err == nil {
				s := string(data)
				tagsJSON = &s
			}
		}

		// Isolate this item so that a failure cannot poison the transaction.
		if _, err := tx.Exec(ctx, "SAVEPOINT save_item"); err != nil {
			return err
		}

		_, execErr := tx.Exec(ctx, upsertItem,
			item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
			NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
			NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
			enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
		)
		if execErr != nil {
			// Skip the failed item: undo only its work and put the
			// transaction back into a usable state.
			if _, err := tx.Exec(ctx, "ROLLBACK TO SAVEPOINT save_item"); err != nil {
				return err
			}
		}

		// The savepoint survives a ROLLBACK TO, so release it in both cases
		// to avoid stacking one savepoint per item.
		if _, err := tx.Exec(ctx, "RELEASE SAVEPOINT save_item"); err != nil {
			return err
		}
	}

	return tx.Commit(ctx)
}

// GetItemsByFeed returns at most limit items belonging to feedURL, ordered by
// pub_date descending.
//
// NOTE(review): with PostgreSQL's default ordering, rows whose pub_date is
// NULL sort first under DESC — confirm that is the intended placement.
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
	const selectByFeed = `
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1
		ORDER BY pub_date DESC
		LIMIT $2
	`

	result, err := c.db.Query(selectByFeed, feedURL, limit)
	if err != nil {
		return nil, err
	}
	// Release the result set once scanItems has consumed it.
	defer result.Close()

	return scanItems(result)
}

// SearchItems runs a full-text search against the items' search_vector column
// and returns at most limit matches, best ts_rank first and then by pub_date
// descending.
//
// The raw query is first converted with ToSearchQuery and the result is handed
// to to_tsquery using the 'english' configuration.
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
	const selectMatches = `
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
		LIMIT $2
	`

	matches, err := c.db.Query(selectMatches, ToSearchQuery(query), limit)
	if err != nil {
		return nil, err
	}
	// Release the result set once scanItems has consumed it.
	defer matches.Close()

	return scanItems(matches)
}

// scanItems reads every remaining row of rows into Items.
//
// Each row must provide, in this order: id, feed_url, guid, title, link,
// description, content, author, pub_date, discovered_at, updated_at,
// enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
// published_at, published_uri. Nullable columns are scanned into pointers and
// mapped to the zero value of the matching Item field.
//
// image_urls and tags are expected to hold JSON arrays of strings; a value
// that fails to decode leaves the field empty rather than failing the row.
//
// A row that cannot be scanned, or an error reported by the row iterator,
// aborts the read: the error is returned together with a nil slice instead of
// silently dropping rows or handing back a partial result. scanItems does not
// close rows; that stays with the caller.
func scanItems(rows pgx.Rows) ([]*Item, error) {
	var items []*Item
	for rows.Next() {
		item := &Item{}
		var guid, title, link, description, content, author *string
		var pubDate, updatedAt, publishedAt *time.Time
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		var imageUrlsJSON, tagsJSON *string
		var publishedUri *string

		if err := rows.Scan(
			&item.ID, &item.FeedURL, &guid, &title, &link,
			&description, &content, &author, &pubDate,
			&item.DiscoveredAt, &updatedAt,
			&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
			&publishedAt, &publishedUri,
		); err != nil {
			// Surface the failure instead of skipping the row.
			return nil, err
		}

		item.GUID = StringValue(guid)
		item.Title = StringValue(title)
		item.Link = StringValue(link)
		item.Description = StringValue(description)
		item.Content = StringValue(content)
		item.Author = StringValue(author)
		item.PubDate = TimeValue(pubDate)
		item.UpdatedAt = TimeValue(updatedAt)

		// Rebuild the enclosure only when a URL was stored.
		if enclosureUrl != nil && *enclosureUrl != "" {
			item.Enclosure = &Enclosure{
				URL:  *enclosureUrl,
				Type: StringValue(enclosureType),
			}
			if enclosureLength != nil {
				item.Enclosure.Length = *enclosureLength
			}
		}

		// Decode the image URL list (best effort).
		if imageUrlsJSON != nil && *imageUrlsJSON != "" {
			var urls []string
			if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
				item.ImageURLs = urls
			}
		}

		// Decode the tag list (best effort).
		if tagsJSON != nil && *tagsJSON != "" {
			var tags []string
			if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
				item.Tags = tags
			}
		}

		item.PublishedAt = TimeValue(publishedAt)
		item.PublishedUri = StringValue(publishedUri)

		items = append(items, item)
	}

	// Next returning false may mean "done" or "failed"; tell them apart.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return items, nil
}

// CleanupOldItems deletes every item whose pub_date lies more than one
// calendar year before the current time. Items without a pub_date are kept.
//
// It returns the value reported by the database call for the DELETE, or 0
// together with the error when the statement fails.
func (c *Crawler) CleanupOldItems() (int64, error) {
	const deleteExpired = `
		DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
	`

	// Same wall-clock instant, one year earlier.
	oneYearAgo := time.Now().AddDate(-1, 0, 0)

	removed, err := c.db.Exec(deleteExpired, oneYearAgo)
	if err != nil {
		return 0, err
	}
	return removed, nil
}

// GetUnpublishedItems returns at most limit items of feedURL whose
// published_at is still NULL, ordered by pub_date ascending.
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
	const selectPending = `
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1 AND published_at IS NULL
		ORDER BY pub_date ASC
		LIMIT $2
	`

	pending, err := c.db.Query(selectPending, feedURL, limit)
	if err != nil {
		return nil, err
	}
	// Release the result set once scanItems has consumed it.
	defer pending.Close()

	return scanItems(pending)
}

// MarkItemPublished records that the item with the given ID has been
// published: published_at is set to the database's NOW() and published_uri to
// uri. The error from the database call is returned unchanged.
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
	const markPublished = `
		UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
	`

	if _, err := c.db.Exec(markPublished, uri, itemID); err != nil {
		return err
	}
	return nil
}

// GetUnpublishedItemCount reports how many items of feedURL have a NULL
// published_at. The count is returned together with any error from running
// the query or scanning its result.
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
	const countPending = `
		SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
	`

	var pending int
	row := c.db.QueryRow(countPending, feedURL)
	err := row.Scan(&pending)
	return pending, err
}