Refactor large Go files into focused modules
Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also includes a domain import batch size increase (1k -> 100k); a rough sketch of that change follows.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
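The batch-size change itself is not part of this file's diff; the sketch below only illustrates the kind of change the message describes, assuming the domain importer inserts rows in fixed-size chunks. The names domainImportBatchSize, importDomains, and insertBatch are hypothetical, not taken from this repository.

package main

import "fmt"

// Hypothetical constant for the batch-size bump described in the commit message.
const domainImportBatchSize = 100_000 // previously 1_000

// importDomains hands domains to insertBatch in chunks of domainImportBatchSize.
func importDomains(domains []string, insertBatch func([]string) error) error {
	for start := 0; start < len(domains); start += domainImportBatchSize {
		end := start + domainImportBatchSize
		if end > len(domains) {
			end = len(domains)
		}
		if err := insertBatch(domains[start:end]); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	domains := make([]string, 250_000)
	batches := 0
	_ = importDomains(domains, func(batch []string) error {
		batches++
		return nil
	})
	fmt.Println("batches:", batches) // 3 with a 100k batch size, 250 at the old 1k size
}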
@@ -1,15 +1,9 @@
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync/atomic"
	"time"

	"github.com/jackc/pgx/v5"
@@ -95,37 +89,6 @@ func classifyFeedByTitle(title string, currentCategory string) string {
	return currentCategory
}

// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
	URL    string `json:"url"`
	Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64  `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed
type Item struct {
	ID           int64     `json:"id,omitempty"`
	FeedURL      string    `json:"feed_url"`
	GUID         string    `json:"guid,omitempty"`
	Title        string    `json:"title,omitempty"`
	Link         string    `json:"link,omitempty"`
	Description  string    `json:"description,omitempty"`
	Content      string    `json:"content,omitempty"`
	Author       string    `json:"author,omitempty"`
	PubDate      time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt    time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content
	Tags      []string   `json:"tags,omitempty"`       // Category/tag strings from feed

	// Publishing to PDS
	PublishedAt  time.Time `json:"published_at,omitempty"`
	PublishedUri string    `json:"published_uri,omitempty"`
}

// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
	URL string `json:"url"`
@@ -537,505 +500,6 @@ func scanFeeds(rows pgx.Rows) ([]*Feed, error) {
	return feeds, rows.Err()
}

// saveItem stores an item in PostgreSQL (upsert by feed_url + guid)
func (c *Crawler) saveItem(item *Item) error {
	// Serialize enclosure fields
	var enclosureUrl, enclosureType *string
	var enclosureLength *int64
	if item.Enclosure != nil {
		enclosureUrl = NullableString(item.Enclosure.URL)
		enclosureType = NullableString(item.Enclosure.Type)
		if item.Enclosure.Length > 0 {
			enclosureLength = &item.Enclosure.Length
		}
	}

	// Serialize imageUrls as JSON
	var imageUrlsJSON *string
	if len(item.ImageURLs) > 0 {
		if data, err := json.Marshal(item.ImageURLs); err == nil {
			s := string(data)
			imageUrlsJSON = &s
		}
	}

	// Serialize tags as JSON
	var tagsJSON *string
	if len(item.Tags) > 0 {
		if data, err := json.Marshal(item.Tags); err == nil {
			s := string(data)
			tagsJSON = &s
		}
	}

	_, err := c.db.Exec(`
		INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
		ON CONFLICT(feed_url, guid) DO UPDATE SET
			title = EXCLUDED.title,
			link = EXCLUDED.link,
			description = EXCLUDED.description,
			content = EXCLUDED.content,
			author = EXCLUDED.author,
			pub_date = EXCLUDED.pub_date,
			updated_at = EXCLUDED.updated_at,
			enclosure_url = EXCLUDED.enclosure_url,
			enclosure_type = EXCLUDED.enclosure_type,
			enclosure_length = EXCLUDED.enclosure_length,
			image_urls = EXCLUDED.image_urls,
			tags = EXCLUDED.tags
	`,
		item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
		NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
		NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
		enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
	)
	return err
}

// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
	if len(items) == 0 {
		return nil
	}

	tx, err := c.db.Begin()
	if err != nil {
		return err
	}
	defer tx.Rollback(context.Background())

	for _, item := range items {
		if item == nil || item.GUID == "" {
			continue // Skip nil items or items without GUID
		}

		// Serialize enclosure fields
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		if item.Enclosure != nil {
			enclosureUrl = NullableString(item.Enclosure.URL)
			enclosureType = NullableString(item.Enclosure.Type)
			if item.Enclosure.Length > 0 {
				enclosureLength = &item.Enclosure.Length
			}
		}

		// Serialize imageUrls as JSON
		var imageUrlsJSON *string
		if len(item.ImageURLs) > 0 {
			if data, err := json.Marshal(item.ImageURLs); err == nil {
				s := string(data)
				imageUrlsJSON = &s
			}
		}

		// Serialize tags as JSON
		var tagsJSON *string
		if len(item.Tags) > 0 {
			if data, err := json.Marshal(item.Tags); err == nil {
				s := string(data)
				tagsJSON = &s
			}
		}

		_, err := tx.Exec(context.Background(), `
			INSERT INTO items (feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
				enclosure_url, enclosure_type, enclosure_length, image_urls, tags)
			VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
			ON CONFLICT(feed_url, guid) DO UPDATE SET
				title = EXCLUDED.title,
				link = EXCLUDED.link,
				description = EXCLUDED.description,
				content = EXCLUDED.content,
				author = EXCLUDED.author,
				pub_date = EXCLUDED.pub_date,
				updated_at = EXCLUDED.updated_at,
				enclosure_url = EXCLUDED.enclosure_url,
				enclosure_type = EXCLUDED.enclosure_type,
				enclosure_length = EXCLUDED.enclosure_length,
				image_urls = EXCLUDED.image_urls,
				tags = EXCLUDED.tags
		`,
			item.FeedURL, item.GUID, NullableString(item.Title), NullableString(item.Link),
			NullableString(item.Description), NullableString(item.Content), NullableString(item.Author),
			NullableTime(item.PubDate), item.DiscoveredAt, NullableTime(item.UpdatedAt),
			enclosureUrl, enclosureType, enclosureLength, imageUrlsJSON, tagsJSON,
		)
		if err != nil {
			continue // Skip failed items
		}
	}

	return tx.Commit(context.Background())
}

// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1
		ORDER BY pub_date DESC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
	tsquery := ToSearchQuery(query)
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE search_vector @@ to_tsquery('english', $1)
		ORDER BY ts_rank(search_vector, to_tsquery('english', $1)) DESC, pub_date DESC
		LIMIT $2
	`, tsquery, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// scanItems is a helper to scan multiple item rows
func scanItems(rows pgx.Rows) ([]*Item, error) {
	var items []*Item
	for rows.Next() {
		item := &Item{}
		var guid, title, link, description, content, author *string
		var pubDate, updatedAt, publishedAt *time.Time
		var enclosureUrl, enclosureType *string
		var enclosureLength *int64
		var imageUrlsJSON, tagsJSON *string
		var publishedUri *string

		if err := rows.Scan(
			&item.ID, &item.FeedURL, &guid, &title, &link,
			&description, &content, &author, &pubDate,
			&item.DiscoveredAt, &updatedAt,
			&enclosureUrl, &enclosureType, &enclosureLength, &imageUrlsJSON, &tagsJSON,
			&publishedAt, &publishedUri,
		); err != nil {
			continue
		}

		item.GUID = StringValue(guid)
		item.Title = StringValue(title)
		item.Link = StringValue(link)
		item.Description = StringValue(description)
		item.Content = StringValue(content)
		item.Author = StringValue(author)
		item.PubDate = TimeValue(pubDate)
		item.UpdatedAt = TimeValue(updatedAt)

		// Parse enclosure
		if enclosureUrl != nil && *enclosureUrl != "" {
			item.Enclosure = &Enclosure{
				URL:  *enclosureUrl,
				Type: StringValue(enclosureType),
			}
			if enclosureLength != nil {
				item.Enclosure.Length = *enclosureLength
			}
		}

		// Parse imageUrls JSON
		if imageUrlsJSON != nil && *imageUrlsJSON != "" {
			var urls []string
			if err := json.Unmarshal([]byte(*imageUrlsJSON), &urls); err == nil {
				item.ImageURLs = urls
			}
		}

		// Parse tags JSON
		if tagsJSON != nil && *tagsJSON != "" {
			var tags []string
			if err := json.Unmarshal([]byte(*tagsJSON), &tags); err == nil {
				item.Tags = tags
			}
		}

		item.PublishedAt = TimeValue(publishedAt)
		item.PublishedUri = StringValue(publishedUri)

		items = append(items, item)
	}

	return items, rows.Err()
}

// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
	cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
	result, err := c.db.Exec(`
		DELETE FROM items WHERE pub_date < $1 AND pub_date IS NOT NULL
	`, cutoff)
	if err != nil {
		return 0, err
	}
	return result, nil
}

// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	now := time.Now()

	feed := &Feed{
		URL:           normalizeURL(feedURL),
		Type:          feedType,
		Category:      classifyFeed(feedURL),
		DiscoveredAt:  now,
		LastCrawledAt: now,
		Status:        "active",
		SourceHost:    sourceHost,
		TLD:           getTLD(sourceHost),
		ETag:          headers.Get("ETag"),
		LastModified:  headers.Get("Last-Modified"),
	}

	// Parse feed-specific metadata and items
	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Refine category based on parsed title (e.g., "Comments on:")
	feed.Category = classifyFeedByTitle(feed.Title, feed.Category)

	// Calculate next crawl time
	feed.NextCrawlAt = c.calculateNextCrawl(feed)

	if err := c.saveFeed(feed); err != nil {
		return
	}

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}
}

// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	now := time.Now()
	normalizedURL := normalizeURL(feedURL)
	feed := &Feed{
		URL:          normalizedURL,
		Type:         feedType,
		Category:     classifyFeed(feedURL),
		DiscoveredAt: now,
		Status:       "active",
		SourceURL:    normalizeURL(sourceURL),
		SourceHost:   sourceHost,
		TLD:          getTLD(sourceHost),
		NextCrawlAt:  now, // Should be crawled immediately
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}
}

// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
	atomic.AddInt32(&c.feedsChecked, 1)

	// Try different scheme/www combinations since we store URLs without scheme
	urlVariants := []string{
		"https://" + feed.URL,
		"http://" + feed.URL,
		"https://www." + feed.URL,
		"http://www." + feed.URL,
	}

	var resp *http.Response
	var err error
	var successURL string

	for _, tryURL := range urlVariants {
		req, reqErr := http.NewRequest("GET", tryURL, nil)
		if reqErr != nil {
			continue
		}

		req.Header.Set("User-Agent", c.UserAgent)

		// Add conditional headers if we have them
		if feed.ETag != "" {
			req.Header.Set("If-None-Match", feed.ETag)
		}
		if feed.LastModified != "" {
			req.Header.Set("If-Modified-Since", feed.LastModified)
		}

		resp, err = c.client.Do(req)
		if err == nil {
			successURL = tryURL
			break
		}
	}

	_ = successURL // May be used later for logging/debugging

	// If no request succeeded, resp will be nil
	if resp == nil {
		if err == nil {
			err = fmt.Errorf("all URL variants failed")
		}
		now := time.Now()
		feed.LastCrawledAt = now
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, err
	}
	defer resp.Body.Close()

	now := time.Now()
	feed.LastCrawledAt = now

	// 304 Not Modified - feed hasn't changed
	if resp.StatusCode == http.StatusNotModified {
		feed.NoUpdate++
		// Adaptive backoff: 100s base + 100s per consecutive no-change
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.ErrorCount = 0
		feed.LastError = ""
		feed.Status = "active"
		c.saveFeed(feed)
		return false, nil
	}

	// Non-200 response
	if resp.StatusCode != http.StatusOK {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = resp.Status
		feed.LastErrorAt = now
		if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
			feed.Status = "dead"
		} else {
			feed.Status = "error"
		}
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, nil
	}

	// 200 OK - feed has new content
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		feed.ErrorCount++
		feed.NoUpdate++
		feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
		feed.LastError = err.Error()
		feed.LastErrorAt = now
		feed.Status = "error"
		// Auto-hold feeds that fail 100+ times
		if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
			feed.PublishStatus = "hold"
			fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
		}
		c.saveFeed(feed)
		return false, err
	}

	body := string(bodyBytes)

	// Update cache headers
	feed.ETag = resp.Header.Get("ETag")
	feed.LastModified = resp.Header.Get("Last-Modified")

	// Re-detect type and parse metadata
	feedType := c.detectFeedType(body)
	feed.Type = feedType

	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Content changed - reset backoff
	feed.NoUpdate = 0
	feed.NextCrawlAt = now.Add(100 * time.Second)
	feed.ErrorCount = 0
	feed.LastError = ""
	feed.Status = "active"
	c.saveFeed(feed)

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}

	return true, nil
}

// SetPublishStatus sets the publish status for a feed ('hold', 'pass', 'skip')
// If status is 'pass', the account handle is also set (auto-derived if empty)
func (c *Crawler) SetPublishStatus(feedURL, status, account string) error {
@@ -1099,39 +563,3 @@ func (c *Crawler) GetPublishCandidates(limit int) ([]*Feed, error) {

	return scanFeeds(rows)
}

// GetUnpublishedItems returns items for a feed that haven't been published yet
func (c *Crawler) GetUnpublishedItems(feedURL string, limit int) ([]*Item, error) {
	rows, err := c.db.Query(`
		SELECT id, feed_url, guid, title, link, description, content, author, pub_date, discovered_at, updated_at,
			enclosure_url, enclosure_type, enclosure_length, image_urls, tags,
			published_at, published_uri
		FROM items
		WHERE feed_url = $1 AND published_at IS NULL
		ORDER BY pub_date ASC
		LIMIT $2
	`, feedURL, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanItems(rows)
}

// MarkItemPublished marks an item as published with the given URI
func (c *Crawler) MarkItemPublished(itemID int64, uri string) error {
	_, err := c.db.Exec(`
		UPDATE items SET published_at = NOW(), published_uri = $1 WHERE id = $2
	`, uri, itemID)
	return err
}

// GetUnpublishedItemCount returns the count of unpublished items for a feed
func (c *Crawler) GetUnpublishedItemCount(feedURL string) (int, error) {
	var count int
	err := c.db.QueryRow(`
		SELECT COUNT(*) FROM items WHERE feed_url = $1 AND published_at IS NULL
	`, feedURL).Scan(&count)
	return count, err
}