Add AT Protocol publishing, media support, and SQLite stability
Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt) (sketched below)
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text

Media extraction:
- Parse RSS enclosures (audio, video, images)
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table

SQLite stability improvements (sketched below):
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
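The deterministic-rkey bullet above refers to publisher.go, which is not part of this file's diff. As a rough illustration only, here is a hedged sketch of how such a derivation could look in Go; the helper name deriveRkey, the RFC 3339 timestamp formatting, and the 16-character truncation are assumptions, not the committed code.

// Hypothetical sketch only; publisher.go is not shown in this diff and may differ.
package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "time"
)

// deriveRkey hashes guid + discoveredAt so the same item always maps to the same
// record key, which makes re-publishing after a crawler restart idempotent.
// Hex output only uses [0-9a-f], which is valid in an AT Protocol record key.
func deriveRkey(guid string, discoveredAt time.Time) string {
    sum := sha256.Sum256([]byte(guid + discoveredAt.UTC().Format(time.RFC3339)))
    return hex.EncodeToString(sum[:])[:16]
}

func main() {
    fmt.Println(deriveRkey("https://example.com/post/1", time.Unix(1700000000, 0)))
}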
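The SQLite stability bullets also land outside this file (database setup and the maintenance loop). A minimal sketch of how those pragmas and periodic tasks could be wired with database/sql, assuming the mattn/go-sqlite3 driver; the exact pragma values, intervals, and backup path are illustrative assumptions.

// Minimal sketch, not the committed code; driver choice and values are assumptions.
package main

import (
    "database/sql"
    "log"
    "time"

    _ "github.com/mattn/go-sqlite3"
)

func openDB(path string) (*sql.DB, error) {
    db, err := sql.Open("sqlite3", path)
    if err != nil {
        return nil, err
    }
    // Connection pool tuning: a few idle connections, bounded lifetime.
    db.SetMaxIdleConns(4)
    db.SetConnMaxLifetime(time.Hour)

    // WAL with relaxed syncing and a bounded autocheckpoint threshold (in pages).
    for _, pragma := range []string{
        "PRAGMA journal_mode=WAL",
        "PRAGMA synchronous=NORMAL",
        "PRAGMA wal_autocheckpoint=1000",
    } {
        if _, err := db.Exec(pragma); err != nil {
            return nil, err
        }
    }
    return db, nil
}

// maintain runs the periodic maintenance described in the commit message:
// WAL checkpoint every 5 minutes, hourly quick_check, daily VACUUM INTO backup.
// Intended to run in its own goroutine.
func maintain(db *sql.DB, backupPath string) {
    checkpoint := time.NewTicker(5 * time.Minute)
    integrity := time.NewTicker(time.Hour)
    backup := time.NewTicker(24 * time.Hour)
    for {
        select {
        case <-checkpoint.C:
            db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
        case <-integrity.C:
            var result string
            if err := db.QueryRow("PRAGMA quick_check").Scan(&result); err != nil || result != "ok" {
                log.Printf("integrity check failed: %v %s", err, result)
            }
        case <-backup.C:
            db.Exec("VACUUM INTO ?", backupPath)
        }
    }
}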
@@ -2,6 +2,7 @@ package main
 
 import (
     "database/sql"
+    "encoding/json"
     "fmt"
     "io"
     "net/http"
@@ -12,58 +13,91 @@ import (
     "time"
 )
 
-// shouldSkipFeed checks if a feed URL should be filtered out
-// Returns true (and a reason) if the feed should be skipped
-func shouldSkipFeed(feedURL string) (bool, string) {
+// classifyFeed determines the category of a feed based on URL patterns
+// Returns: "main", "comments", "category", "author", "article", "podcast"
+// Note: podcast detection is also done in parseRSSMetadata based on content
+func classifyFeed(feedURL string) string {
     lower := strings.ToLower(feedURL)
 
-    // Skip explicit comment feeds
+    // Comment feeds
     if strings.Contains(lower, "/comment") {
-        return true, "comment feed"
+        return "comments"
     }
 
+    // Podcast URL patterns
+    podcastPatterns := []string{"/podcast", "/podcasts", "/episode", "/episodes", "/show/", "/shows/"}
+    for _, pattern := range podcastPatterns {
+        if strings.Contains(lower, pattern) {
+            return "podcast"
+        }
+    }
+
     u, err := url.Parse(feedURL)
     if err != nil {
-        return false, ""
+        return "main"
     }
 
     path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
 
-    // Skip category/tag feeds
-    categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
+    // Author feeds
+    if strings.Contains(path, "/author/") {
+        return "author"
+    }
+
+    // Category/tag feeds
+    categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/"}
     for _, pattern := range categoryPatterns {
         if strings.Contains(path, pattern) {
-            return true, "category/tag feed"
+            return "category"
         }
     }
 
-    // Check for article comment feeds (path ending in /feed with content before it)
+    // Check for article feeds (path ending in /feed with content before it)
     if strings.HasSuffix(path, "/feed") {
         basePath := strings.TrimSuffix(path, "/feed")
         basePath = strings.Trim(basePath, "/")
 
         if basePath == "" {
-            return false, "" // Just /feed - legitimate main feed
+            return "main" // Just /feed - main feed
         }
 
-        // Skip if path contains date patterns (likely article)
+        // Article if path contains date patterns
        if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
-            return true, "article feed (date pattern)"
+            return "article"
        }
 
-        // Skip if path has multiple segments (likely article or nested content)
+        // Article if path has multiple segments (nested content)
         segments := strings.Split(basePath, "/")
         if len(segments) >= 2 {
-            return true, "article feed (nested path)"
+            return "article"
         }
 
-        // Skip if single segment looks like an article slug (contains hyphens, is long)
-        if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
-            return true, "article feed (slug pattern)"
+        // Article if single segment looks like an article slug
+        if len(segments) == 1 && strings.Contains(segments[0], "-") && len(segments[0]) > 20 {
+            return "article"
         }
     }
 
-    return false, ""
+    return "main"
 }
 
+// classifyFeedByTitle refines category based on feed title (called after parsing)
+func classifyFeedByTitle(title string, currentCategory string) string {
+    if currentCategory != "main" {
+        return currentCategory // Already classified by URL
+    }
+    lower := strings.ToLower(title)
+    if strings.HasPrefix(lower, "comments on:") || strings.HasPrefix(lower, "comments for:") {
+        return "comments"
+    }
+    return currentCategory
+}
+
+// Enclosure represents a media attachment (audio, video, image)
+type Enclosure struct {
+    URL    string `json:"url"`
+    Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
+    Length int64  `json:"length"` // Size in bytes
+}
+
 // Item represents an individual entry/article from a feed
@@ -79,12 +113,21 @@ type Item struct {
     PubDate      time.Time `json:"pub_date,omitempty"`
     DiscoveredAt time.Time `json:"discovered_at"`
     UpdatedAt    time.Time `json:"updated_at,omitempty"`
+
+    // Media attachments
+    Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
+    ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content
+
+    // Publishing to PDS
+    PublishedAt  time.Time `json:"published_at,omitempty"`
+    PublishedUri string    `json:"published_uri,omitempty"`
 }
 
 // Feed represents a discovered RSS/Atom feed with metadata
 type Feed struct {
     URL         string `json:"url"`
-    Type        string `json:"type"` // "rss", "atom", or "unknown"
+    Type        string `json:"type"`     // "rss", "atom", or "unknown"
+    Category    string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
     Title       string `json:"title,omitempty"`
     Description string `json:"description,omitempty"`
     Language    string `json:"language,omitempty"`
@@ -124,23 +167,35 @@ type Feed struct {
 
     // Adaptive check interval
     NoUpdate int `json:"no_update"` // Consecutive checks with no change
+
+    // Publishing to PDS
+    PublishStatus  string `json:"publish_status"`            // "held", "pass", "fail"
+    PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
 }
 
 // saveFeed stores a feed in SQLite
 func (c *Crawler) saveFeed(feed *Feed) error {
+    // Default publishStatus to "held" if not set
+    publishStatus := feed.PublishStatus
+    if publishStatus == "" {
+        publishStatus = "held"
+    }
+
     _, err := c.db.Exec(`
         INSERT INTO feeds (
-            url, type, title, description, language, siteUrl,
+            url, type, category, title, description, language, siteUrl,
             discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
             etag, lastModified,
             ttlMinutes, updatePeriod, updateFreq,
             status, errorCount, lastError, lastErrorAt,
             sourceUrl, sourceHost, tld,
             itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
-            noUpdate
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            noUpdate,
+            publishStatus, publishAccount
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
         ON CONFLICT(url) DO UPDATE SET
             type = excluded.type,
+            category = excluded.category,
             title = excluded.title,
             description = excluded.description,
             language = excluded.language,
@@ -161,9 +216,11 @@ func (c *Crawler) saveFeed(feed *Feed) error {
             avgPostFreqHrs = excluded.avgPostFreqHrs,
             oldestItemDate = excluded.oldestItemDate,
             newestItemDate = excluded.newestItemDate,
-            noUpdate = excluded.noUpdate
+            noUpdate = excluded.noUpdate,
+            publishStatus = excluded.publishStatus,
+            publishAccount = excluded.publishAccount
     `,
-        feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
+        feed.URL, feed.Type, feed.Category, nullString(feed.Title), nullString(feed.Description),
         nullString(feed.Language), nullString(feed.SiteURL),
         feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
         nullString(feed.ETag), nullString(feed.LastModified),
@@ -172,6 +229,7 @@ func (c *Crawler) saveFeed(feed *Feed) error {
         nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
         feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
         feed.NoUpdate,
+        publishStatus, nullString(feed.PublishAccount),
     )
     return err
 }
@@ -179,23 +237,25 @@ func (c *Crawler) saveFeed(feed *Feed) error {
 // getFeed retrieves a feed from SQLite
 func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
     feed := &Feed{}
-    var title, description, language, siteURL sql.NullString
+    var category, title, description, language, siteURL sql.NullString
     var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
     var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
     var avgPostFreqHrs sql.NullFloat64
+    var publishStatus, publishAccount sql.NullString
 
     err := c.db.QueryRow(`
-        SELECT url, type, title, description, language, siteUrl,
+        SELECT url, type, category, title, description, language, siteUrl,
             discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
             etag, lastModified,
             ttlMinutes, updatePeriod, updateFreq,
             status, errorCount, lastError, lastErrorAt,
             sourceUrl, sourceHost, tld,
             itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
-            noUpdate
+            noUpdate,
+            publishStatus, publishAccount
         FROM feeds WHERE url = ?
     `, normalizeURL(feedURL)).Scan(
-        &feed.URL, &feed.Type, &title, &description, &language, &siteURL,
+        &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
         &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
         &etag, &lastModified,
         &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
@@ -203,6 +263,7 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
         &sourceURL, &sourceHost, &tld,
         &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
         &feed.NoUpdate,
+        &publishStatus, &publishAccount,
     )
 
     if err == sql.ErrNoRows {
@@ -213,6 +274,11 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
     }
 
     // Handle nullable fields
+    if category.Valid {
+        feed.Category = category.String
+    } else {
+        feed.Category = "main" // Default
+    }
     if title.Valid {
         feed.Title = title.String
     }
@@ -267,6 +333,14 @@ func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
     if newestItemDate.Valid {
         feed.NewestItemDate = newestItemDate.Time
     }
+    if publishStatus.Valid {
+        feed.PublishStatus = publishStatus.String
+    } else {
+        feed.PublishStatus = "held"
+    }
+    if publishAccount.Valid {
+        feed.PublishAccount = publishAccount.String
+    }
 
     return feed, nil
 }
@@ -281,14 +355,15 @@ func (c *Crawler) feedExists(feedURL string) bool {
 // GetAllFeeds returns all feeds from the database
 func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
     rows, err := c.db.Query(`
-        SELECT url, type, title, description, language, siteUrl,
+        SELECT url, type, category, title, description, language, siteUrl,
             discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
             etag, lastModified,
             ttlMinutes, updatePeriod, updateFreq,
             status, errorCount, lastError, lastErrorAt,
             sourceUrl, sourceHost, tld,
             itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
-            noUpdate
+            noUpdate,
+            publishStatus, publishAccount
         FROM feeds
     `)
     if err != nil {
@@ -316,14 +391,15 @@ func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
 // GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
 func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
     rows, err := c.db.Query(`
-        SELECT url, type, title, description, language, siteUrl,
+        SELECT url, type, category, title, description, language, siteUrl,
             discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
             etag, lastModified,
             ttlMinutes, updatePeriod, updateFreq,
             status, errorCount, lastError, lastErrorAt,
             sourceUrl, sourceHost, tld,
             itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
-            noUpdate
+            noUpdate,
+            publishStatus, publishAccount
         FROM feeds
         WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
         ORDER BY RANDOM()
@@ -340,14 +416,15 @@ func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
 // GetFeedsByHost returns all feeds from a specific host
 func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
     rows, err := c.db.Query(`
-        SELECT url, type, title, description, language, siteUrl,
+        SELECT url, type, category, title, description, language, siteUrl,
             discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
             etag, lastModified,
             ttlMinutes, updatePeriod, updateFreq,
             status, errorCount, lastError, lastErrorAt,
             sourceUrl, sourceHost, tld,
             itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
-            noUpdate
+            noUpdate,
+            publishStatus, publishAccount
         FROM feeds WHERE sourceHost = ?
     `, host)
     if err != nil {
@@ -361,14 +438,15 @@ func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
 // SearchFeeds performs a full-text search on feeds
 func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
     rows, err := c.db.Query(`
-        SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
+        SELECT f.url, f.type, f.category, f.title, f.description, f.language, f.siteUrl,
             f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
             f.etag, f.lastModified,
             f.ttlMinutes, f.updatePeriod, f.updateFreq,
             f.status, f.errorCount, f.lastError, f.lastErrorAt,
             f.sourceUrl, f.sourceHost, f.tld,
             f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
-            f.noUpdate
+            f.noUpdate,
+            f.publishStatus, f.publishAccount
         FROM feeds f
         JOIN feeds_fts fts ON f.rowid = fts.rowid
         WHERE feeds_fts MATCH ?
@@ -387,13 +465,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
 
     for rows.Next() {
         feed := &Feed{}
-        var title, description, language, siteURL sql.NullString
+        var category, title, description, language, siteURL sql.NullString
         var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
         var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
         var avgPostFreqHrs sql.NullFloat64
+        var publishStatus, publishAccount sql.NullString
 
         if err := rows.Scan(
-            &feed.URL, &feed.Type, &title, &description, &language, &siteURL,
+            &feed.URL, &feed.Type, &category, &title, &description, &language, &siteURL,
             &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
             &etag, &lastModified,
             &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
@@ -401,11 +480,17 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
             &sourceURL, &sourceHost, &tld,
             &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
             &feed.NoUpdate,
+            &publishStatus, &publishAccount,
         ); err != nil {
             continue
         }
 
         // Handle nullable fields
+        if category.Valid {
+            feed.Category = category.String
+        } else {
+            feed.Category = "main"
+        }
         if title.Valid {
             feed.Title = title.String
         }
@@ -460,6 +545,14 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
         if newestItemDate.Valid {
             feed.NewestItemDate = newestItemDate.Time
         }
+        if publishStatus.Valid {
+            feed.PublishStatus = publishStatus.String
+        } else {
+            feed.PublishStatus = "held"
+        }
+        if publishAccount.Valid {
+            feed.PublishAccount = publishAccount.String
+        }
 
         feeds = append(feeds, feed)
     }
@@ -469,9 +562,27 @@ func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
 
 // saveItem stores an item in SQLite (upsert by feedUrl + guid)
 func (c *Crawler) saveItem(item *Item) error {
+    // Serialize enclosure fields
+    var enclosureUrl, enclosureType sql.NullString
+    var enclosureLength sql.NullInt64
+    if item.Enclosure != nil {
+        enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
+        enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
+        enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
+    }
+
+    // Serialize imageUrls as JSON
+    var imageUrlsJSON sql.NullString
+    if len(item.ImageURLs) > 0 {
+        if data, err := json.Marshal(item.ImageURLs); err == nil {
+            imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
+        }
+    }
+
     _, err := c.db.Exec(`
-        INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+            enclosureUrl, enclosureType, enclosureLength, imageUrls)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
         ON CONFLICT(feedUrl, guid) DO UPDATE SET
             title = excluded.title,
             link = excluded.link,
@@ -479,11 +590,16 @@ func (c *Crawler) saveItem(item *Item) error {
             content = excluded.content,
             author = excluded.author,
             pubDate = excluded.pubDate,
-            updatedAt = excluded.updatedAt
+            updatedAt = excluded.updatedAt,
+            enclosureUrl = excluded.enclosureUrl,
+            enclosureType = excluded.enclosureType,
+            enclosureLength = excluded.enclosureLength,
+            imageUrls = excluded.imageUrls
     `,
         item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
         nullString(item.Description), nullString(item.Content), nullString(item.Author),
         nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
+        enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
     )
     return err
 }
@@ -501,8 +617,9 @@ func (c *Crawler) saveItems(items []*Item) error {
     defer tx.Rollback()
 
     stmt, err := tx.Prepare(`
-        INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+            enclosureUrl, enclosureType, enclosureLength, imageUrls)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
         ON CONFLICT(feedUrl, guid) DO UPDATE SET
             title = excluded.title,
             link = excluded.link,
@@ -510,7 +627,11 @@ func (c *Crawler) saveItems(items []*Item) error {
             content = excluded.content,
             author = excluded.author,
             pubDate = excluded.pubDate,
-            updatedAt = excluded.updatedAt
+            updatedAt = excluded.updatedAt,
+            enclosureUrl = excluded.enclosureUrl,
+            enclosureType = excluded.enclosureType,
+            enclosureLength = excluded.enclosureLength,
+            imageUrls = excluded.imageUrls
     `)
     if err != nil {
         return err
@@ -521,10 +642,29 @@ func (c *Crawler) saveItems(items []*Item) error {
         if item == nil || item.GUID == "" {
             continue // Skip nil items or items without GUID
         }
+
+        // Serialize enclosure fields
+        var enclosureUrl, enclosureType sql.NullString
+        var enclosureLength sql.NullInt64
+        if item.Enclosure != nil {
+            enclosureUrl = sql.NullString{String: item.Enclosure.URL, Valid: item.Enclosure.URL != ""}
+            enclosureType = sql.NullString{String: item.Enclosure.Type, Valid: item.Enclosure.Type != ""}
+            enclosureLength = sql.NullInt64{Int64: item.Enclosure.Length, Valid: item.Enclosure.Length > 0}
+        }
+
+        // Serialize imageUrls as JSON
+        var imageUrlsJSON sql.NullString
+        if len(item.ImageURLs) > 0 {
+            if data, err := json.Marshal(item.ImageURLs); err == nil {
+                imageUrlsJSON = sql.NullString{String: string(data), Valid: true}
+            }
+        }
+
         _, err := stmt.Exec(
             item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
             nullString(item.Description), nullString(item.Content), nullString(item.Author),
             nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
+            enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON,
         )
         if err != nil {
             continue // Skip failed items
@@ -537,7 +677,9 @@ func (c *Crawler) saveItems(items []*Item) error {
 // GetItemsByFeed returns all items for a specific feed
 func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
     rows, err := c.db.Query(`
-        SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
+        SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+            enclosureUrl, enclosureType, enclosureLength, imageUrls,
+            publishedAt, publishedUri
         FROM items
         WHERE feedUrl = ?
        ORDER BY pubDate DESC
@@ -548,55 +690,15 @@ func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
     }
     defer rows.Close()
 
-    var items []*Item
-    for rows.Next() {
-        item := &Item{}
-        var guid, title, link, description, content, author sql.NullString
-        var pubDate, updatedAt sql.NullTime
-
-        if err := rows.Scan(
-            &item.ID, &item.FeedURL, &guid, &title, &link,
-            &description, &content, &author, &pubDate,
-            &item.DiscoveredAt, &updatedAt,
-        ); err != nil {
-            continue
-        }
-
-        if guid.Valid {
-            item.GUID = guid.String
-        }
-        if title.Valid {
-            item.Title = title.String
-        }
-        if link.Valid {
-            item.Link = link.String
-        }
-        if description.Valid {
-            item.Description = description.String
-        }
-        if content.Valid {
-            item.Content = content.String
-        }
-        if author.Valid {
-            item.Author = author.String
-        }
-        if pubDate.Valid {
-            item.PubDate = pubDate.Time
-        }
-        if updatedAt.Valid {
-            item.UpdatedAt = updatedAt.Time
-        }
-
-        items = append(items, item)
-    }
-
-    return items, rows.Err()
+    return scanItems(rows)
 }
 
 // SearchItems performs a full-text search on items
 func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
     rows, err := c.db.Query(`
-        SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
+        SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt,
+            i.enclosureUrl, i.enclosureType, i.enclosureLength, i.imageUrls,
+            i.publishedAt, i.publishedUri
         FROM items i
         JOIN items_fts fts ON i.id = fts.rowid
         WHERE items_fts MATCH ?
@@ -608,16 +710,27 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
     }
     defer rows.Close()
 
+    return scanItems(rows)
+}
+
+// scanItems is a helper to scan multiple item rows
+func scanItems(rows *sql.Rows) ([]*Item, error) {
     var items []*Item
     for rows.Next() {
         item := &Item{}
         var guid, title, link, description, content, author sql.NullString
-        var pubDate, updatedAt sql.NullTime
+        var pubDate, updatedAt, publishedAt sql.NullTime
+        var enclosureUrl, enclosureType sql.NullString
+        var enclosureLength sql.NullInt64
+        var imageUrlsJSON sql.NullString
+        var publishedUri sql.NullString
 
         if err := rows.Scan(
             &item.ID, &item.FeedURL, &guid, &title, &link,
             &description, &content, &author, &pubDate,
             &item.DiscoveredAt, &updatedAt,
+            &enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON,
+            &publishedAt, &publishedUri,
         ); err != nil {
             continue
         }
@@ -647,6 +760,32 @@ func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
             item.UpdatedAt = updatedAt.Time
         }
 
+        // Parse enclosure
+        if enclosureUrl.Valid && enclosureUrl.String != "" {
+            item.Enclosure = &Enclosure{
+                URL:  enclosureUrl.String,
+                Type: enclosureType.String,
+            }
+            if enclosureLength.Valid {
+                item.Enclosure.Length = enclosureLength.Int64
+            }
+        }
+
+        // Parse imageUrls JSON
+        if imageUrlsJSON.Valid && imageUrlsJSON.String != "" {
+            var urls []string
+            if err := json.Unmarshal([]byte(imageUrlsJSON.String), &urls); err == nil {
+                item.ImageURLs = urls
+            }
+        }
+
+        if publishedAt.Valid {
+            item.PublishedAt = publishedAt.Time
+        }
+        if publishedUri.Valid {
+            item.PublishedUri = publishedUri.String
+        }
+
         items = append(items, item)
     }
@@ -667,10 +806,6 @@ func (c *Crawler) CleanupOldItems() (int64, error) {
 
 // processFeed parses and stores a feed with full metadata
 func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
-    if strings.Contains(feedURL, "/comment") {
-        return
-    }
-
     // Fast path: check without lock
     if c.feedExists(feedURL) {
         return
@@ -690,6 +825,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
     feed := &Feed{
         URL:           normalizeURL(feedURL),
         Type:          feedType,
+        Category:      classifyFeed(feedURL),
         DiscoveredAt:  now,
         LastCrawledAt: now,
         Status:        "active",
@@ -708,6 +844,9 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
         items = c.parseAtomMetadata(body, feed)
     }
 
+    // Refine category based on parsed title (e.g., "Comments on:")
+    feed.Category = classifyFeedByTitle(feed.Title, feed.Category)
+
     // Calculate next crawl time
     feed.NextCrawlAt = c.calculateNextCrawl(feed)
 
@@ -723,11 +862,6 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
 
 // addFeed adds a discovered feed URL (not yet fetched)
 func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
-    // Skip comment, category, and article feeds
-    if skip, _ := shouldSkipFeed(feedURL); skip {
-        return
-    }
-
     // Fast path: check without lock
     if c.feedExists(feedURL) {
         return
@@ -746,6 +880,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
     feed := &Feed{
         URL:          normalizedURL,
         Type:         feedType,
+        Category:     classifyFeed(feedURL),
         DiscoveredAt: now,
         Status:       "active",
         SourceURL:    normalizeURL(sourceURL),
@@ -896,3 +1031,103 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
 
     return true, nil
 }
+
+// SetPublishStatus sets the publish status for a feed ('held', 'pass', 'fail')
+// If status is 'pass', the account handle is also set (auto-derived if empty)
+func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
+    feedURL = normalizeURL(feedURL)
+
+    // Auto-derive account if passing and not provided
+    if status == "pass" && account == "" {
+        account = DeriveHandleFromFeed(feedURL)
+    }
+
+    _, err := c.db.Exec(`
+        UPDATE feeds SET publishStatus = ?, publishAccount = ? WHERE url = ?
+    `, status, nullString(account), feedURL)
+    return err
+}
+
+// GetFeedsByPublishStatus returns all feeds with a specific publish status
+func (c *Crawler) GetFeedsByPublishStatus(status string) ([]*Feed, error) {
+    rows, err := c.db.Query(`
+        SELECT url, type, category, title, description, language, siteUrl,
+            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
+            etag, lastModified,
+            ttlMinutes, updatePeriod, updateFreq,
+            status, errorCount, lastError, lastErrorAt,
+            sourceUrl, sourceHost, tld,
+            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
+            noUpdate,
+            publishStatus, publishAccount
+        FROM feeds
+        WHERE publishStatus = ?
+    `, status)
+    if err != nil {
+        return nil, err
+    }
+    defer rows.Close()
+
+    return scanFeeds(rows)
+}
+
+// GetPublishCandidates returns feeds that are held for review and have items
+func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {
+    rows, err := c.db.Query(`
+        SELECT url, type, category, title, description, language, siteUrl,
+            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
+            etag, lastModified,
+            ttlMinutes, updatePeriod, updateFreq,
+            status, errorCount, lastError, lastErrorAt,
+            sourceUrl, sourceHost, tld,
+            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
+            noUpdate,
+            publishStatus, publishAccount
+        FROM feeds
+        WHERE publishStatus = 'held' AND itemCount > 0 AND status = 'active'
+        ORDER BY itemCount DESC
+        LIMIT ?
+    `, limit)
+    if err != nil {
+        return nil, err
+    }
+    defer rows.Close()
+
+    return scanFeeds(rows)
+}
+
+// GetUnpublishedItems returns items for a feed that haven't been published yet
+func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
+    rows, err := c.db.Query(`
+        SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt,
+            enclosureUrl, enclosureType, enclosureLength, imageUrls,
+            publishedAt, publishedUri
+        FROM items
+        WHERE feedUrl = ? AND publishedAt IS NULL
+        ORDER BY pubDate ASC
+        LIMIT ?
+    `, feedURL, limit)
+    if err != nil {
+        return nil, err
+    }
+    defer rows.Close()
+
+    return scanItems(rows)
+}
+
+// MarkItemPublished marks an item as published with the given URI
+func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
+    _, err := c.db.Exec(`
+        UPDATE items SET publishedAt = datetime('now'), publishedUri = ? WHERE id = ?
+    `, uri, itemID)
+    return err
+}
+
+// GetUnpublishedItemCount returns the count of unpublished items for a feed
+func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
+    var count int
+    err := c.db.QueryRow(`
+        SELECT COUNT(*) FROM items WHERE feedUrl = ? AND publishedAt IS NULL
+    `, feedURL).Scan(&count)
+    return count, err
+}