Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also includes domain import batch size increase (1k -> 100k).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
329 lines · 9.9 KiB · Go
package main

import (
	"context"
	"encoding/json"
	"time"

	"github.com/jackc/pgx/v5"
)
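
// NOTE: c.db is assumed here to be a package-level wrapper around a pgx
// connection pool that supplies its own context to Exec/Query/QueryRow/Begin.
// Transactions returned by Begin are plain pgx.Tx values, which is why
// tx.Exec/Commit/Rollback below take an explicit context.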

// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
	URL    string `json:"url"`
	Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64  `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed
type Item struct {
	ID           int64     `json:"id,omitempty"`
	FeedURL      string    `json:"feed_url"`
	GUID         string    `json:"guid,omitempty"`
	Title        string    `json:"title,omitempty"`
	Link         string    `json:"link,omitempty"`
	Description  string    `json:"description,omitempty"`
	Content      string    `json:"content,omitempty"`
	Author       string    `json:"author,omitempty"`
	PubDate      time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt    time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content
	Tags      []string   `json:"tags,omitempty"`       // Category/tag strings from feed

	// Publishing to PDS
	PublishedAt  time.Time `json:"published_at,omitempty"`
	PublishedUri string    `json:"published_uri,omitempty"`
}
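
// NullableString, NullableTime, StringValue, and TimeValue are defined elsewhere
// in this package. A minimal sketch of the signatures and behavior assumed by the
// code below (empty strings and zero times map to NULL, and back again):
//
//	func NullableString(s string) *string {
//		if s == "" {
//			return nil
//		}
//		return &s
//	}
//
//	func NullableTime(t time.Time) *time.Time {
//		if t.IsZero() {
//			return nil
//		}
//		return &t
//	}
//
//	func StringValue(p *string) string {
//		if p == nil {
//			return ""
//		}
//		return *p
//	}
//
//	func TimeValue(p *time.Time) time.Time {
//		if p == nil {
//			return time.Time{}
//		}
//		return *p
//	}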

// saveItem stores an item in PostgreSQL (upsert by feed_url + guid)
func (c *Crawler) saveItem(item *Item) error {
	// Serialize enclosure fields
	var enclosureUrl, enclosureType *string
	var enclosureLength *int64
	if item.Enclosure != nil {
		enclosureUrl = NullableString(item.Enclosure.URL)
		enclosureType = NullableString(item.Enclosure.Type)
		if item.Enclosure.Length > 0 {
			enclosureLength = &item.Enclosure.Length
		}
	}

	// Serialize imageUrls as JSON
	var imageUrlsJSON *string
	if len(item.ImageURLs) > 0 {
		if data, err := json.Marshal(item.ImageURLs); err == nil {
			s := string(data)
			imageUrlsJSON = &s
		}
	}

	// Serialize tags as JSON
	var tagsJSON *string
	if len(item.Tags) > 0 {
		if data, err := json.Marshal(item.Tags); err == nil {
			s := string(data)
			tagsJSON = &s
		}
	}

	_, err := c.db.Exec(`
		INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
		ON CONFLICT(feed_url, guid) DO UPDATE SET
			title = EXCLUDED.title,
			link = EXCLUDED.link,
			description = EXCLUDED.description,
			content = EXCLUDED.content,
			author = EXCLUDED.author,
			pub_date = EXCLUDED.pub_date,
			updated_at = EXCLUDED.updated_at,
			enclosure_url = EXCLUDED.enclosure_url,
			enclosure_type = EXCLUDED.enclosure_type,
			enclosure_length = EXCLUDED.enclosure_length,
			image_urls = EXCLUDED.image_urls,
			tags = EXCLUDED.tags
	`,
		item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
		NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
		NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
		enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
	)
	return err
}
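
// Example usage of saveItem (a sketch, assuming a *Crawler named crawler with a
// connected db; the URLs below are illustrative only):
//
//	item := &Item{
//		FeedURL:      "https://example.com/feed.xml",
//		GUID:         "https://example.com/posts/42",
//		Title:        "Hello, world",
//		Link:         "https://example.com/posts/42",
//		PubDate:      time.Now(),
//		DiscoveredAt: time.Now(),
//		Enclosure:    &Enclosure{URL: "https://example.com/ep1.mp3", Type: "audio/mpeg", Length: 12345678},
//	}
//	if err := crawler.saveItem(item); err != nil {
//		log.Printf("save item: %v", err)
//	}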

// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}

	tx, err := c.db.Begin()
	if err != nil {
		return err
	}
	defer tx.Rollback(context.Background())

	for _, item := range items {
		if item == nil || item.GUID == "" {
			continue // Skip nil items or items without GUID
		}

		// Serialize enclosure fields
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		if item.Enclosure != nil {
			enclosureUrl = NullableString(item.Enclosure.URL)
			enclosureType = NullableString(item.Enclosure.Type)
			if item.Enclosure.Length > 0 {
				enclosureLength = &item.Enclosure.Length
			}
		}

		// Serialize imageUrls as JSON
		var imageUrlsJSON *string
		if len(item.ImageURLs) > 0 {
			if data, err := json.Marshal(item.ImageURLs); err == nil {
				s := string(data)
				imageUrlsJSON = &s
			}
		}

		// Serialize tags as JSON
		var tagsJSON *string
		if len(item.Tags) > 0 {
			if data, err := json.Marshal(item.Tags); err == nil {
				s := string(data)
				tagsJSON = &s
			}
		}

		_, err := tx.Exec(context.Background(), `
			INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
				enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
			VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
			ON CONFLICT(feed_url, guid) DO UPDATE SET
				title = EXCLUDED.title,
				link = EXCLUDED.link,
				description = EXCLUDED.description,
				content = EXCLUDED.content,
				author = EXCLUDED.author,
				pub_date = EXCLUDED.pub_date,
				updated_at = EXCLUDED.updated_at,
				enclosure_url = EXCLUDED.enclosure_url,
				enclosure_type = EXCLUDED.enclosure_type,
				enclosure_length = EXCLUDED.enclosure_length,
				image_urls = EXCLUDED.image_urls,
				tags = EXCLUDED.tags
		`,
			item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
			NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
			NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
			enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
		)
		if err != nil {
			// Skip failed items. Note: in PostgreSQL a failed statement aborts the
			// surrounding transaction, so later inserts in this batch will also fail
			// and the final Commit rolls everything back; see the savepoint sketch
			// after this function.
			continue
		}
	}

	return tx.Commit(context.Background())
}
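
// Because a failed INSERT aborts the whole PostgreSQL transaction, the
// "skip failed items" path above cannot actually salvage the rest of the
// batch. A sketch of one way to make the skip effective, using pgx's
// savepoint-backed nested transactions (calling Begin on an open pgx.Tx
// issues SAVEPOINT); insertSQL and args stand in for the statement and
// arguments used above:
//
//	sp, err := tx.Begin(context.Background())
//	if err != nil {
//		continue
//	}
//	if _, err := sp.Exec(context.Background(), insertSQL, args...); err != nil {
//		sp.Rollback(context.Background()) // roll back to the savepoint, keep the batch
//		continue
//	}
//	sp.Commit(context.Background()) // release the savepoint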

// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1
		ORDER BY pub_date DESC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
	tsquery := ToSearchQuery(query)
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
		LIMIT $2
	`, tsquery, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}
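
// ToSearchQuery is defined elsewhere in the package; it turns raw user input
// into a to_tsquery-compatible expression. A minimal sketch of what such a
// helper might look like (an assumption, not the actual implementation: strip
// tsquery operators, then AND the remaining terms; needs the strings import):
//
//	func ToSearchQuery(query string) string {
//		clean := strings.NewReplacer("&", " ", "|", " ", "!", " ", "(", " ", ")", " ", ":", " ", "'", " ").Replace(query)
//		return strings.Join(strings.Fields(clean), " & ")
//	}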

// scanItems is a helper to scan multiple item rows
func scanItems(rows pgx.Rows) ([]*Item, error) {
	var items []*Item
	for rows.Next() {
		item := &Item{}
		var guid, title, link, description, content, author *string
		var pubDate, updatedAt, publishedAt *time.Time
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		var imageUrlsJSON, tagsJSON *string
		var publishedUri *string

		if err := rows.Scan(
			&item.ID, &item.FeedURL, &guid, &title, &link,
			&description, &content, &author, &pubDate,
			&item.DiscoveredAt, &updatedAt,
			&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
			&publishedAt, &publishedUri,
		); err != nil {
			continue
		}

		item.GUID = StringValue(guid)
		item.Title = StringValue(title)
		item.Link = StringValue(link)
		item.Description = StringValue(description)
		item.Content = StringValue(content)
		item.Author = StringValue(author)
		item.PubDate = TimeValue(pubDate)
		item.UpdatedAt = TimeValue(updatedAt)

		// Parse enclosure
		if enclosureUrl != nil && *enclosureUrl != "" {
			item.Enclosure = &Enclosure{
				URL:  *enclosureUrl,
				Type: StringValue(enclosureType),
			}
			if enclosureLength != nil {
				item.Enclosure.Length = *enclosureLength
			}
		}

		// Parse imageUrls JSON
		if imageUrlsJSON != nil && *imageUrlsJSON != "" {
			var urls []string
			if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
				item.ImageURLs = urls
			}
		}

		// Parse tags JSON
		if tagsJSON != nil && *tagsJSON != "" {
			var tags []string
			if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
				item.Tags = tags
			}
		}

		item.PublishedAt = TimeValue(publishedAt)
		item.PublishedUri = StringValue(publishedUri)

		items = append(items, item)
	}

	return items, rows.Err()
}

// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
	cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
	result, err := c.db.Exec(`
		DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
	`, cutoff)
	if err != nil {
		return 0, err
	}
	return result, nil
}

// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1 AND published_at IS NULL
		ORDER BY pub_date ASC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
	_, err := c.db.Exec(`
		UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
	`, uri, itemID)
	return err
}

// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
	var count int
	err := c.db.QueryRow(`
		SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
	`, feedURL).Scan(&count)
	return count, err
}
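
// Example of how the publishing helpers above fit together (a sketch, assuming a
// *Crawler named crawler; publishItem is a hypothetical stand-in for the actual
// PDS publishing step, which the commit message places in publisher.go as
// PublishItem):
//
//	items, err := crawler.GetUnpublishedItems(feedURL, 50)
//	if err != nil {
//		return err
//	}
//	for _, item := range items {
//		uri, err := publishItem(item) // hypothetical helper returning the record URI
//		if err != nil {
//			continue
//		}
//		if err := crawler.MarkItemPublished(item.ID, uri); err != nil {
//			log.Printf("mark published: %v", err)
//		}
//	}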