Add Docker support and refactor data layer
@@ -1,15 +1,86 @@
package main

import (
    "encoding/json"
    "database/sql"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "regexp"
    "strings"
    "sync/atomic"
    "time"

    "github.com/cockroachdb/pebble"
)

// shouldSkipFeed checks if a feed URL should be filtered out
// Returns true (and a reason) if the feed should be skipped
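// Illustrative examples of how the heuristics below behave (not exhaustive):
//   example.com/feed                                  -> kept (bare main feed)
//   example.com/category/tech/feed                    -> skipped ("category/tag feed")
//   example.com/blog/2024/05/some-post/feed           -> skipped ("article feed (date pattern)")
//   example.com/news/local/feed                       -> skipped ("article feed (nested path)")
//   example.com/a-fairly-long-article-slug-here/feed  -> skipped ("article feed (slug pattern)")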
func shouldSkipFeed(feedURL string) (bool, string) {
    lower := strings.ToLower(feedURL)

    // Skip explicit comment feeds
    if strings.Contains(lower, "/comment") {
        return true, "comment feed"
    }

    u, err := url.Parse(feedURL)
    if err != nil {
        return false, ""
    }

    path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))

    // Skip category/tag feeds
    categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
    for _, pattern := range categoryPatterns {
        if strings.Contains(path, pattern) {
            return true, "category/tag feed"
        }
    }

    // Check for article comment feeds (path ending in /feed with content before it)
    if strings.HasSuffix(path, "/feed") {
        basePath := strings.TrimSuffix(path, "/feed")
        basePath = strings.Trim(basePath, "/")

        if basePath == "" {
            return false, "" // Just /feed - legitimate main feed
        }

        // Skip if path contains date patterns (likely article)
        if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
            return true, "article feed (date pattern)"
        }

        // Skip if path has multiple segments (likely article or nested content)
        segments := strings.Split(basePath, "/")
        if len(segments) >= 2 {
            return true, "article feed (nested path)"
        }

        // Skip if single segment looks like an article slug (contains hyphens, is long)
        if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
            return true, "article feed (slug pattern)"
        }
    }

    return false, ""
}

// Item represents an individual entry/article from a feed
type Item struct {
    ID           int64     `json:"id,omitempty"`
    FeedURL      string    `json:"feed_url"`
    GUID         string    `json:"guid,omitempty"`
    Title        string    `json:"title,omitempty"`
    Link         string    `json:"link,omitempty"`
    Description  string    `json:"description,omitempty"`
    Content      string    `json:"content,omitempty"`
    Author       string    `json:"author,omitempty"`
    PubDate      time.Time `json:"pub_date,omitempty"`
    DiscoveredAt time.Time `json:"discovered_at"`
    UpdatedAt    time.Time `json:"updated_at,omitempty"`
}

// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
    URL string `json:"url"`
@@ -50,99 +121,548 @@ type Feed struct {
    AvgPostFreqHrs float64   `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
    OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
    NewestItemDate time.Time `json:"newest_item_date,omitempty"`

    // Adaptive check interval
    NoUpdate int `json:"no_update"` // Consecutive checks with no change
}

// saveFeed stores a feed in PebbleDB
// saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error {
    data, err := json.Marshal(feed)
    if err != nil {
        return fmt.Errorf("failed to marshal feed: %v", err)
    }

    key := []byte("feed:" + feed.URL)
    return c.db.Set(key, data, pebble.Sync)
    _, err := c.db.Exec(`
        INSERT INTO feeds (
            url, type, title, description, language, siteUrl,
            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
            etag, lastModified,
            ttlMinutes, updatePeriod, updateFreq,
            status, errorCount, lastError, lastErrorAt,
            sourceUrl, sourceHost, tld,
            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
            noUpdate
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(url) DO UPDATE SET
            type = excluded.type,
            title = excluded.title,
            description = excluded.description,
            language = excluded.language,
            siteUrl = excluded.siteUrl,
            lastCrawledAt = excluded.lastCrawledAt,
            nextCrawlAt = excluded.nextCrawlAt,
            lastBuildDate = excluded.lastBuildDate,
            etag = excluded.etag,
            lastModified = excluded.lastModified,
            ttlMinutes = excluded.ttlMinutes,
            updatePeriod = excluded.updatePeriod,
            updateFreq = excluded.updateFreq,
            status = excluded.status,
            errorCount = excluded.errorCount,
            lastError = excluded.lastError,
            lastErrorAt = excluded.lastErrorAt,
            itemCount = excluded.itemCount,
            avgPostFreqHrs = excluded.avgPostFreqHrs,
            oldestItemDate = excluded.oldestItemDate,
            newestItemDate = excluded.newestItemDate,
            noUpdate = excluded.noUpdate
    `,
        feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
        nullString(feed.Language), nullString(feed.SiteURL),
        feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
        nullString(feed.ETag), nullString(feed.LastModified),
        feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq,
        feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt),
        nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
        feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
        feed.NoUpdate,
    )
    return err
}

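// Note: nullString and nullTime are helpers defined elsewhere in this package;
// from their use here they presumably wrap empty strings and zero times as
// sql.NullString / sql.NullTime so that missing values are stored as NULL.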
// getFeed retrieves a feed from PebbleDB
// getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
    key := []byte("feed:" + normalizeURL(feedURL))
    data, closer, err := c.db.Get(key)
    feed := &Feed{}
    var title, description, language, siteURL sql.NullString
    var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
    var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
    var avgPostFreqHrs sql.NullFloat64

    err := c.db.QueryRow(`
        SELECT url, type, title, description, language, siteUrl,
            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
            etag, lastModified,
            ttlMinutes, updatePeriod, updateFreq,
            status, errorCount, lastError, lastErrorAt,
            sourceUrl, sourceHost, tld,
            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
            noUpdate
        FROM feeds WHERE url = ?
    `, normalizeURL(feedURL)).Scan(
        &feed.URL, &feed.Type, &title, &description, &language, &siteURL,
        &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
        &etag, &lastModified,
        &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
        &feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
        &sourceURL, &sourceHost, &tld,
        &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
        &feed.NoUpdate,
    )

    if err == sql.ErrNoRows {
        return nil, nil
    }
    if err != nil {
        if err == pebble.ErrNotFound {
            return nil, nil
        }
        return nil, err
    }
    defer closer.Close()

    var feed Feed
    if err := json.Unmarshal(data, &feed); err != nil {
        return nil, fmt.Errorf("failed to unmarshal feed: %v", err)
    // Handle nullable fields
    if title.Valid {
        feed.Title = title.String
    }
    return &feed, nil
    if description.Valid {
        feed.Description = description.String
    }
    if language.Valid {
        feed.Language = language.String
    }
    if siteURL.Valid {
        feed.SiteURL = siteURL.String
    }
    if lastCrawledAt.Valid {
        feed.LastCrawledAt = lastCrawledAt.Time
    }
    if nextCrawlAt.Valid {
        feed.NextCrawlAt = nextCrawlAt.Time
    }
    if lastBuildDate.Valid {
        feed.LastBuildDate = lastBuildDate.Time
    }
    if etag.Valid {
        feed.ETag = etag.String
    }
    if lastModified.Valid {
        feed.LastModified = lastModified.String
    }
    if updatePeriod.Valid {
        feed.UpdatePeriod = updatePeriod.String
    }
    if lastError.Valid {
        feed.LastError = lastError.String
    }
    if lastErrorAt.Valid {
        feed.LastErrorAt = lastErrorAt.Time
    }
    if sourceURL.Valid {
        feed.SourceURL = sourceURL.String
    }
    if sourceHost.Valid {
        feed.SourceHost = sourceHost.String
    }
    if tld.Valid {
        feed.TLD = tld.String
    }
    if avgPostFreqHrs.Valid {
        feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
    }
    if oldestItemDate.Valid {
        feed.OldestItemDate = oldestItemDate.Time
    }
    if newestItemDate.Valid {
        feed.NewestItemDate = newestItemDate.Time
    }

    return feed, nil
}

// feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool {
    key := []byte("feed:" + normalizeURL(feedURL))
    _, closer, err := c.db.Get(key)
    if err != nil {
        return false
    }
    closer.Close()
    return true
    var exists bool
    err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists)
    return err == nil && exists
}

// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
    var feeds []*Feed

    iter, err := c.db.NewIter(&pebble.IterOptions{
        LowerBound: []byte("feed:"),
        UpperBound: []byte("feed:\xff"),
    })
    rows, err := c.db.Query(`
        SELECT url, type, title, description, language, siteUrl,
            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
            etag, lastModified,
            ttlMinutes, updatePeriod, updateFreq,
            status, errorCount, lastError, lastErrorAt,
            sourceUrl, sourceHost, tld,
            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
            noUpdate
        FROM feeds
    `)
    if err != nil {
        return nil, err
    }
    defer iter.Close()
    defer rows.Close()

    for iter.First(); iter.Valid(); iter.Next() {
        var feed Feed
        if err := json.Unmarshal(iter.Value(), &feed); err != nil {
            continue
        }
        feeds = append(feeds, &feed)
    }

    if err := iter.Error(); err != nil {
        return nil, err
    }

    return feeds, nil
    return scanFeeds(rows)
}

// GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) {
    count := 0
    var count int
    err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
    return count, err
}

    iter, err := c.db.NewIter(&pebble.IterOptions{
        LowerBound: []byte("feed:"),
        UpperBound: []byte("feed:\xff"),
    })
// GetFeedCountByHost returns the number of feeds for a specific host
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
    var count int
    err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count)
    return count, err
}

// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
    rows, err := c.db.Query(`
        SELECT url, type, title, description, language, siteUrl,
            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
            etag, lastModified,
            ttlMinutes, updatePeriod, updateFreq,
            status, errorCount, lastError, lastErrorAt,
            sourceUrl, sourceHost, tld,
            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
            noUpdate
        FROM feeds
        WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
        ORDER BY RANDOM()
        LIMIT ?
    `, limit)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    return scanFeeds(rows)
}

// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
    rows, err := c.db.Query(`
        SELECT url, type, title, description, language, siteUrl,
            discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
            etag, lastModified,
            ttlMinutes, updatePeriod, updateFreq,
            status, errorCount, lastError, lastErrorAt,
            sourceUrl, sourceHost, tld,
            itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
            noUpdate
        FROM feeds WHERE sourceHost = ?
    `, host)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    return scanFeeds(rows)
}

// SearchFeeds performs a full-text search on feeds
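// feeds_fts is assumed to be an FTS virtual table over the feeds table, created
// in the schema elsewhere in this change; the query string uses SQLite FTS
// MATCH syntax, e.g. SearchFeeds("golang OR rust").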
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
    rows, err := c.db.Query(`
        SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
            f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
            f.etag, f.lastModified,
            f.ttlMinutes, f.updatePeriod, f.updateFreq,
            f.status, f.errorCount, f.lastError, f.lastErrorAt,
            f.sourceUrl, f.sourceHost, f.tld,
            f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
            f.noUpdate
        FROM feeds f
        JOIN feeds_fts fts ON f.rowid = fts.rowid
        WHERE feeds_fts MATCH ?
    `, query)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    return scanFeeds(rows)
}

// scanFeeds is a helper to scan multiple feed rows
func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
    var feeds []*Feed

    for rows.Next() {
        feed := &Feed{}
        var title, description, language, siteURL sql.NullString
        var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
        var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
        var avgPostFreqHrs sql.NullFloat64

        if err := rows.Scan(
            &feed.URL, &feed.Type, &title, &description, &language, &siteURL,
            &feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
            &etag, &lastModified,
            &feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
            &feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
            &sourceURL, &sourceHost, &tld,
            &feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
            &feed.NoUpdate,
        ); err != nil {
            continue
        }

        // Handle nullable fields
        if title.Valid {
            feed.Title = title.String
        }
        if description.Valid {
            feed.Description = description.String
        }
        if language.Valid {
            feed.Language = language.String
        }
        if siteURL.Valid {
            feed.SiteURL = siteURL.String
        }
        if lastCrawledAt.Valid {
            feed.LastCrawledAt = lastCrawledAt.Time
        }
        if nextCrawlAt.Valid {
            feed.NextCrawlAt = nextCrawlAt.Time
        }
        if lastBuildDate.Valid {
            feed.LastBuildDate = lastBuildDate.Time
        }
        if etag.Valid {
            feed.ETag = etag.String
        }
        if lastModified.Valid {
            feed.LastModified = lastModified.String
        }
        if updatePeriod.Valid {
            feed.UpdatePeriod = updatePeriod.String
        }
        if lastError.Valid {
            feed.LastError = lastError.String
        }
        if lastErrorAt.Valid {
            feed.LastErrorAt = lastErrorAt.Time
        }
        if sourceURL.Valid {
            feed.SourceURL = sourceURL.String
        }
        if sourceHost.Valid {
            feed.SourceHost = sourceHost.String
        }
        if tld.Valid {
            feed.TLD = tld.String
        }
        if avgPostFreqHrs.Valid {
            feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
        }
        if oldestItemDate.Valid {
            feed.OldestItemDate = oldestItemDate.Time
        }
        if newestItemDate.Valid {
            feed.NewestItemDate = newestItemDate.Time
        }

        feeds = append(feeds, feed)
    }

    return feeds, rows.Err()
}

// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
    _, err := c.db.Exec(`
        INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(feedUrl, guid) DO UPDATE SET
            title = excluded.title,
            link = excluded.link,
            description = excluded.description,
            content = excluded.content,
            author = excluded.author,
            pubDate = excluded.pubDate,
            updatedAt = excluded.updatedAt
    `,
        item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
        nullString(item.Description), nullString(item.Content), nullString(item.Author),
        nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
    )
    return err
}

// saveItems stores multiple items efficiently
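// The whole batch is written inside a single transaction using one prepared
// statement; rows that fail to exec (or that lack a GUID) are skipped rather
// than aborting the batch, and the deferred Rollback only takes effect if
// Commit is never reached.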
func (c *Crawler) saveItems(items []*Item) error {
    if len(items) == 0 {
        return nil
    }

    tx, err := c.db.Begin()
    if err != nil {
        return err
    }
    defer tx.Rollback()

    stmt, err := tx.Prepare(`
        INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(feedUrl, guid) DO UPDATE SET
            title = excluded.title,
            link = excluded.link,
            description = excluded.description,
            content = excluded.content,
            author = excluded.author,
            pubDate = excluded.pubDate,
            updatedAt = excluded.updatedAt
    `)
    if err != nil {
        return err
    }
    defer stmt.Close()

    for _, item := range items {
        if item == nil || item.GUID == "" {
            continue // Skip nil items or items without GUID
        }
        _, err := stmt.Exec(
            item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
            nullString(item.Description), nullString(item.Content), nullString(item.Author),
            nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
        )
        if err != nil {
            continue // Skip failed items
        }
    }

    return tx.Commit()
}

// GetItemsByFeed returns the most recent items for a specific feed, up to limit
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
    rows, err := c.db.Query(`
        SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
        FROM items
        WHERE feedUrl = ?
        ORDER BY pubDate DESC
        LIMIT ?
    `, feedURL, limit)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    var items []*Item
    for rows.Next() {
        item := &Item{}
        var guid, title, link, description, content, author sql.NullString
        var pubDate, updatedAt sql.NullTime

        if err := rows.Scan(
            &item.ID, &item.FeedURL, &guid, &title, &link,
            &description, &content, &author, &pubDate,
            &item.DiscoveredAt, &updatedAt,
        ); err != nil {
            continue
        }

        if guid.Valid {
            item.GUID = guid.String
        }
        if title.Valid {
            item.Title = title.String
        }
        if link.Valid {
            item.Link = link.String
        }
        if description.Valid {
            item.Description = description.String
        }
        if content.Valid {
            item.Content = content.String
        }
        if author.Valid {
            item.Author = author.String
        }
        if pubDate.Valid {
            item.PubDate = pubDate.Time
        }
        if updatedAt.Valid {
            item.UpdatedAt = updatedAt.Time
        }

        items = append(items, item)
    }

    return items, rows.Err()
}

// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
    rows, err := c.db.Query(`
        SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
        FROM items i
        JOIN items_fts fts ON i.id = fts.rowid
        WHERE items_fts MATCH ?
        ORDER BY i.pubDate DESC
        LIMIT ?
    `, query, limit)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    var items []*Item
    for rows.Next() {
        item := &Item{}
        var guid, title, link, description, content, author sql.NullString
        var pubDate, updatedAt sql.NullTime

        if err := rows.Scan(
            &item.ID, &item.FeedURL, &guid, &title, &link,
            &description, &content, &author, &pubDate,
            &item.DiscoveredAt, &updatedAt,
        ); err != nil {
            continue
        }

        if guid.Valid {
            item.GUID = guid.String
        }
        if title.Valid {
            item.Title = title.String
        }
        if link.Valid {
            item.Link = link.String
        }
        if description.Valid {
            item.Description = description.String
        }
        if content.Valid {
            item.Content = content.String
        }
        if author.Valid {
            item.Author = author.String
        }
        if pubDate.Valid {
            item.PubDate = pubDate.Time
        }
        if updatedAt.Valid {
            item.UpdatedAt = updatedAt.Time
        }

        items = append(items, item)
    }

    return items, rows.Err()
}

// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
    cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
    result, err := c.db.Exec(`
        DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL
    `, cutoff)
    if err != nil {
        return 0, err
    }
    defer iter.Close()

    for iter.First(); iter.Valid(); iter.Next() {
        count++
    }

    if err := iter.Error(); err != nil {
        return 0, err
    }

    return count, nil
    return result.RowsAffected()
}

// processFeed parses and stores a feed with full metadata
@@ -179,12 +699,13 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
        LastModified: headers.Get("Last-Modified"),
    }

    // Parse feed-specific metadata
    // Parse feed-specific metadata and items
    var items []*Item
    switch feedType {
    case "rss":
        c.parseRSSMetadata(body, feed)
        items = c.parseRSSMetadata(body, feed)
    case "atom":
        c.parseAtomMetadata(body, feed)
        items = c.parseAtomMetadata(body, feed)
    }

    // Calculate next crawl time
@@ -193,11 +714,17 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
    if err := c.saveFeed(feed); err != nil {
        return
    }

    // Save items
    if len(items) > 0 {
        c.saveItems(items)
    }
}

// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
    if strings.Contains(feedURL, "/comment") {
    // Skip comment, category, and article feeds
    if skip, _ := shouldSkipFeed(feedURL); skip {
        return
    }

@@ -231,3 +758,141 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
        return
    }
}

// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
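// Scheduling, as implemented below: a changed feed is re-checked ~100s later
// (NoUpdate resets to 0), while an unchanged or failing check backs off linearly
// to 100s + 100s*NoUpdate, so e.g. five consecutive unchanged checks push the
// next poll out to roughly 600s.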
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
    atomic.AddInt32(&c.feedsChecked, 1)

    // Try different scheme/www combinations since we store URLs without scheme
    urlVariants := []string{
        "https://" + feed.URL,
        "http://" + feed.URL,
        "https://www." + feed.URL,
        "http://www." + feed.URL,
    }

    var resp *http.Response
    var err error
    var successURL string

    for _, tryURL := range urlVariants {
        req, reqErr := http.NewRequest("GET", tryURL, nil)
        if reqErr != nil {
            continue
        }

        req.Header.Set("User-Agent", c.UserAgent)

        // Add conditional headers if we have them
        if feed.ETag != "" {
            req.Header.Set("If-None-Match", feed.ETag)
        }
        if feed.LastModified != "" {
            req.Header.Set("If-Modified-Since", feed.LastModified)
        }

        resp, err = c.client.Do(req)
        if err == nil {
            successURL = tryURL
            break
        }
    }

    _ = successURL // May be used later for logging/debugging

    // If no request succeeded, resp will be nil
    if resp == nil {
        if err == nil {
            err = fmt.Errorf("all URL variants failed")
        }
        now := time.Now()
        feed.LastCrawledAt = now
        feed.ErrorCount++
        feed.NoUpdate++
        feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
        feed.LastError = err.Error()
        feed.LastErrorAt = now
        feed.Status = "error"
        c.saveFeed(feed)
        return false, err
    }
    defer resp.Body.Close()

    now := time.Now()
    feed.LastCrawledAt = now

    // 304 Not Modified - feed hasn't changed
    if resp.StatusCode == http.StatusNotModified {
        feed.NoUpdate++
        // Adaptive backoff: 100s base + 100s per consecutive no-change
        feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
        feed.ErrorCount = 0
        feed.LastError = ""
        feed.Status = "active"
        c.saveFeed(feed)
        return false, nil
    }

    // Non-200 response
    if resp.StatusCode != http.StatusOK {
        feed.ErrorCount++
        feed.NoUpdate++
        feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
        feed.LastError = resp.Status
        feed.LastErrorAt = now
        if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
            feed.Status = "dead"
        } else {
            feed.Status = "error"
        }
        c.saveFeed(feed)
        return false, nil
    }

    // 200 OK - feed has new content
    bodyBytes, err := io.ReadAll(resp.Body)
    if err != nil {
        feed.ErrorCount++
        feed.NoUpdate++
        feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
        feed.LastError = err.Error()
        feed.LastErrorAt = now
        feed.Status = "error"
        c.saveFeed(feed)
        return false, err
    }

    body := string(bodyBytes)

    // Update cache headers
    feed.ETag = resp.Header.Get("ETag")
    feed.LastModified = resp.Header.Get("Last-Modified")

    // Re-detect type and parse metadata
    feedType := c.detectFeedType(body)
    feed.Type = feedType

    var items []*Item
    switch feedType {
    case "rss":
        items = c.parseRSSMetadata(body, feed)
    case "atom":
        items = c.parseAtomMetadata(body, feed)
    }

    // Content changed - reset backoff
    feed.NoUpdate = 0
    feed.NextCrawlAt = now.Add(100 * time.Second)
    feed.ErrorCount = 0
    feed.LastError = ""
    feed.Status = "active"
    c.saveFeed(feed)

    // Save items
    if len(items) > 0 {
        c.saveItems(items)
    }

    return true, nil
}