commit 1ff14ee957afb81680352337a6e0d4bb97ddff35
Author: primal
Date:   Mon Feb 2 11:19:04 2026 -0500

    Initial commit: shared Go module for 1440.news services

    Contains:
    - db.go: Database connection wrapper with helper methods
    - models.go: Domain, Feed, Item, ShortURL, Click structs
    - util.go: URL normalization, TLD functions, search helpers
    - handle.go: AT Protocol handle derivation from feed URLs

    Co-Authored-By: Claude Opus 4.5

diff --git a/db.go b/db.go
new file mode 100644
index 0000000..cb9b4c7
--- /dev/null
+++ b/db.go
@@ -0,0 +1,156 @@
+package shared
+
+import (
+	"context"
+	"fmt"
+	"net/url"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/jackc/pgx/v5"
+	"github.com/jackc/pgx/v5/pgxpool"
+)
+
+// DB wraps pgxpool.Pool with helper methods
+type DB struct {
+	*pgxpool.Pool
+}
+
+// OpenDatabase connects to PostgreSQL using environment variables or a connection string
+func OpenDatabase(connString string) (*DB, error) {
+	fmt.Printf("Connecting to database...\n")
+
+	// If connection string not provided, try environment variables
+	if connString == "" {
+		connString = os.Getenv("DATABASE_URL")
+	}
+	if connString == "" {
+		// Build from individual env vars
+		host := GetEnvOrDefault("DB_HOST", "infra-postgres")
+		port := GetEnvOrDefault("DB_PORT", "5432")
+		user := GetEnvOrDefault("DB_USER", "dba_1440_news")
+		dbname := GetEnvOrDefault("DB_NAME", "db_1440_news")
+
+		// Support Docker secrets (password file) or direct password
+		password := os.Getenv("DB_PASSWORD")
+		if password == "" {
+			if passwordFile := os.Getenv("DB_PASSWORD_FILE"); passwordFile != "" {
+				data, err := os.ReadFile(passwordFile)
+				if err != nil {
+					return nil, fmt.Errorf("failed to read password file: %v", err)
+				}
+				password = strings.TrimSpace(string(data))
+			}
+		}
+
+		connString = fmt.Sprintf("postgres://%s:%s@%s:%s/%s?sslmode=disable",
+			user, url.QueryEscape(password), host, port, dbname)
+	}
+
+	config, err := pgxpool.ParseConfig(connString)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse connection string: %v", err)
+	}
+
+	// Connection pool settings
+	config.MaxConns = 10
+	config.MinConns = 0 // Don't pre-create connections to avoid schema race conditions
+	config.MaxConnLifetime = 5 * time.Minute
+	config.MaxConnIdleTime = 1 * time.Minute
+
+	ctx := context.Background()
+	pool, err := pgxpool.NewWithConfig(ctx, config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to database: %v", err)
+	}
+
+	// Verify connection
+	if err := pool.Ping(ctx); err != nil {
+		pool.Close()
+		return nil, fmt.Errorf("failed to ping database: %v", err)
+	}
+	fmt.Println("Connected to PostgreSQL")
+
+	return &DB{pool}, nil
+}
+
+// GetEnvOrDefault returns environment variable value or default
+func GetEnvOrDefault(key, defaultVal string) string {
+	if val := os.Getenv(key); val != "" {
+		return val
+	}
+	return defaultVal
+}
+
+// QueryRow wraps pool.QueryRow for compatibility
+func (db *DB) QueryRow(query string, args ...interface{}) pgx.Row {
+	return db.Pool.QueryRow(context.Background(), query, args...)
+}
+
+// Query wraps pool.Query for compatibility
+func (db *DB) Query(query string, args ...interface{}) (pgx.Rows, error) {
+	return db.Pool.Query(context.Background(), query, args...)
+}
+
+// Exec wraps pool.Exec for compatibility
+func (db *DB) Exec(query string, args ...interface{}) (int64, error) {
+	result, err := db.Pool.Exec(context.Background(), query, args...)
+	if err != nil {
+		return 0, err
+	}
+	return result.RowsAffected(), nil
+}
+
+// Begin starts a transaction
+func (db *DB) Begin() (pgx.Tx, error) {
+	return db.Pool.Begin(context.Background())
+}
+
+// Close closes the connection pool
+func (db *DB) Close() error {
+	db.Pool.Close()
+	return nil
+}
+
+// NullableString returns nil for empty strings, otherwise the string pointer
+func NullableString(s string) *string {
+	if s == "" {
+		return nil
+	}
+	return &s
+}
+
+// NullableTime returns nil for zero times, otherwise the time pointer
+func NullableTime(t time.Time) *time.Time {
+	if t.IsZero() {
+		return nil
+	}
+	return &t
+}
+
+// StringValue returns empty string for nil, otherwise the dereferenced value
+func StringValue(s *string) string {
+	if s == nil {
+		return ""
+	}
+	return *s
+}
+
+// TimeValue returns zero time for nil, otherwise the dereferenced value
+func TimeValue(t *time.Time) time.Time {
+	if t == nil {
+		return time.Time{}
+	}
+	return *t
+}
+
+// ToSearchQuery converts a user query to PostgreSQL tsquery format
+func ToSearchQuery(query string) string {
+	// Simple conversion: split on spaces and join with &
+	words := strings.Fields(query)
+	if len(words) == 0 {
+		return ""
+	}
+	return strings.Join(words, " & ")
+}
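
A minimal usage sketch for the db.go helpers (not part of the commit; the module path comes from go.mod below, and the query string is hypothetical — real configuration arrives via DATABASE_URL or the DB_* variables):

package main

import (
	"fmt"
	"log"

	shared "github.com/1440news/shared"
)

func main() {
	// An empty string falls back to DATABASE_URL, then to the DB_* vars.
	db, err := shared.OpenDatabase("")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// The nullable helpers map Go zero values to SQL NULL and back.
	title := shared.NullableString("") // nil: stored as NULL
	fmt.Println(shared.StringValue(title) == "")

	// ToSearchQuery produces input for PostgreSQL's to_tsquery:
	// "solar power" -> "solar & power".
	fmt.Println(shared.ToSearchQuery("solar power"))
}
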
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..aab0719
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,14 @@
+module github.com/1440news/shared
+
+go 1.24.0
+
+require github.com/jackc/pgx/v5 v5.7.5
+
+require (
+	github.com/jackc/pgpassfile v1.0.0 // indirect
+	github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
+	github.com/jackc/puddle/v2 v2.2.2 // indirect
+	golang.org/x/crypto v0.47.0 // indirect
+	golang.org/x/sync v0.19.0 // indirect
+	golang.org/x/text v0.33.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..0c35200
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,28 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
+github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
+github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
+github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
+github.com/jackc/pgx/v5 v5.7.5 h1:JHGfMnQY+IEtGM63d+NGMjoRpysB2JBwDr5fsngwmJs=
+github.com/jackc/pgx/v5 v5.7.5/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M=
+github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
+github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
+golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
+golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/handle.go b/handle.go
new file mode 100644
index 0000000..bd0ec29
--- /dev/null
+++ b/handle.go
@@ -0,0 +1,262 @@
+package shared
+
+import (
+	"net/url"
+	"regexp"
+	"strings"
+)
+
+// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
+// Format: {domain}-{category}.1440.news
+// AT Protocol allows up to 63 characters per label, but the PDS
+// restricts the first segment to 18 characters for local handles.
+// Examples:
+//
+//	feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
+//	news.ycombinator.com/rss → ycombinator.1440.news
+func DeriveHandleFromFeed(feedURL string) string {
+	const maxSubdomainLen = 18 // PDS limit for first segment
+
+	// Ensure we have a scheme for parsing
+	if !strings.Contains(feedURL, "://") {
+		feedURL = "https://" + feedURL
+	}
+
+	u, err := url.Parse(feedURL)
+	if err != nil {
+		return ""
+	}
+
+	hostname := strings.ToLower(u.Hostname())
+	path := strings.ToLower(u.Path)
+
+	// Remove common feed suffixes/extensions
+	suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
+	for _, suffix := range suffixesToRemove {
+		path = strings.TrimSuffix(path, suffix)
+	}
+
+	// Split path into segments and filter noise
+	segments := strings.Split(strings.Trim(path, "/"), "/")
+	skipPathWords := map[string]bool{
+		"rss": true, "feed": true, "feeds": true, "atom": true,
+		"xml": true, "default": true, "index": true, "services": true,
+		"nyt": true,
+	}
+
+	var pathParts []string
+	for _, seg := range segments {
+		seg = cleanHandleSegment(seg)
+		if seg != "" && !skipPathWords[seg] {
+			pathParts = append(pathParts, seg)
+		}
+	}
+
+	// Split hostname and extract the meaningful domain
+	hostParts := strings.Split(hostname, ".")
+
+	// Two-part TLDs to handle specially
+	twoPartTLDs := map[string]bool{
+		"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
+		"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
+	}
+
+	// Check for two-part TLD
+	if len(hostParts) >= 2 {
+		possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
+		if twoPartTLDs[possibleTwoPartTLD] {
+			hostParts = hostParts[:len(hostParts)-2]
+		} else {
+			// Single TLD - remove it
+			singleTLDs := map[string]bool{
+				"com": true, "org": true, "net": true, "io": true,
+				"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
+			}
+			if singleTLDs[hostParts[len(hostParts)-1]] {
+				hostParts = hostParts[:len(hostParts)-1]
+			}
+		}
+	}
+
+	// Skip noise subdomains
+	skipHostWords := map[string]bool{
+		"www": true, "feeds": true, "rss": true, "feed": true,
+		"api": true, "cdn": true, "static": true, "news": true,
+	}
+
+	var meaningfulHostParts []string
+	for _, part := range hostParts {
+		if !skipHostWords[part] && part != "" {
+			meaningfulHostParts = append(meaningfulHostParts, part)
+		}
+	}
+
+	// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
+	var mainDomain string
+	if len(meaningfulHostParts) > 0 {
+		mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
+	} else if len(hostParts) > 0 {
+		mainDomain = hostParts[len(hostParts)-1]
+	}
+
+	// Special case: "bbci" should become "bbc"
+	if mainDomain == "bbci" {
+		mainDomain = "bbc"
+	}
+
+	// Abbreviations for long category names to fit the 18-char limit
+	categoryAbbrevs := map[string]string{
+		"science-and-environment": "sci-env",
+		"entertainment-and-arts":  "ent-arts",
+		"science-environment":     "sci-env",
+		"entertainment-arts":      "ent-arts",
+		"technology":              "tech",
+		"business":                "biz",
+		"international":           "intl",
+		"environment":             "env",
+		"entertainment":           "ent",
+		"politics":                "pol",
+	}
+
+	// Build subdomain: domain + category (from path)
+	var subdomain string
+	if len(pathParts) > 0 {
+		// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
+		category := pathParts[len(pathParts)-1]
+		// Skip generic categories
+		if category == "news" && len(pathParts) == 1 {
+			subdomain = mainDomain
+		} else {
+			// Try to abbreviate if the full subdomain would be too long
+			fullSubdomain := mainDomain + "-" + category
+			if len(fullSubdomain) > maxSubdomainLen {
+				if abbrev, ok := categoryAbbrevs[category]; ok {
+					category = abbrev
+				}
+			}
+			subdomain = mainDomain + "-" + category
+		}
+	} else {
+		subdomain = mainDomain
+	}
+
+	// If still too long, just use the main hostname
+	if len(subdomain) > maxSubdomainLen {
+		subdomain = mainDomain
+	}
+
+	// Final safety: truncate if still too long
+	if len(subdomain) > maxSubdomainLen {
+		subdomain = subdomain[:maxSubdomainLen]
+	}
+
+	subdomain = strings.Trim(subdomain, "-")
+
+	// Collapse multiple hyphens
+	for strings.Contains(subdomain, "--") {
+		subdomain = strings.ReplaceAll(subdomain, "--", "-")
+	}
+
+	return subdomain + ".1440.news"
+}
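
A sketch exercising DeriveHandleFromFeed on the two inputs from its doc comment (not part of the commit):

package main

import (
	"fmt"

	shared "github.com/1440news/shared"
)

func main() {
	feeds := []string{
		"https://feeds.bbci.co.uk/news/technology/rss.xml", // bbc-technology.1440.news
		"news.ycombinator.com/rss",                         // scheme is added automatically
	}
	for _, f := range feeds {
		fmt.Printf("%s -> %s\n", f, shared.DeriveHandleFromFeed(f))
	}
}
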
+
+// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
+// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
+func cleanHandleSegment(s string) string {
+	// Remove file extensions
+	if idx := strings.LastIndex(s, "."); idx > 0 {
+		s = s[:idx]
+	}
+
+	// Convert to lowercase
+	s = strings.ToLower(s)
+
+	// Strip common feed prefixes/suffixes from the segment itself
+	// e.g., "showrss" → "show", "rssworld" → "world"
+	feedAffixes := []string{"rss", "feed", "atom", "xml"}
+	for _, affix := range feedAffixes {
+		// Strip suffix (e.g., "showrss" → "show")
+		if strings.HasSuffix(s, affix) && len(s) > len(affix) {
+			s = strings.TrimSuffix(s, affix)
+			break
+		}
+		// Strip prefix (e.g., "rssworld" → "world")
+		if strings.HasPrefix(s, affix) && len(s) > len(affix) {
+			s = strings.TrimPrefix(s, affix)
+			break
+		}
+	}
+
+	// Replace underscores and other separators with hyphens
+	s = strings.ReplaceAll(s, "_", "-")
+	s = strings.ReplaceAll(s, " ", "-")
+
+	// Remove any characters that aren't alphanumeric or hyphens
+	reg := regexp.MustCompile(`[^a-z0-9-]`)
+	s = reg.ReplaceAllString(s, "")
+
+	// Collapse multiple hyphens
+	for strings.Contains(s, "--") {
+		s = strings.ReplaceAll(s, "--", "-")
+	}
+
+	// Trim leading/trailing hyphens
+	s = strings.Trim(s, "-")
+
+	return s
+}
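
Since cleanHandleSegment is unexported, a behavior sketch belongs in an in-package test (not part of the commit; expected values follow from the affix-stripping rules above):

package shared

import "testing"

func TestCleanHandleSegment(t *testing.T) {
	cases := map[string]string{
		"showrss":  "show",     // "rss" suffix stripped
		"rssworld": "world",    // "rss" prefix stripped
		"Top_News": "top-news", // lowercased, underscore becomes hyphen
		"a--b--":   "a-b",      // hyphens collapsed, trailing hyphen trimmed
	}
	for in, want := range cases {
		if got := cleanHandleSegment(in); got != want {
			t.Errorf("cleanHandleSegment(%q) = %q, want %q", in, got, want)
		}
	}
}
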
+
+// SplitHandle extracts the path prefix and hostname from a derived handle
+// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
+func SplitHandle(handle string) (prefix string, hostname string) {
+	// Remove .1440.news suffix
+	handle = strings.TrimSuffix(handle, ".1440.news")
+
+	parts := strings.Split(handle, ".")
+
+	// Try to find where the hostname starts by looking for valid hostname
+	// patterns. Start at i=1: starting at i=0 would match the entire
+	// remainder first and never split off a prefix.
+	if len(parts) >= 2 {
+		for i := 1; i < len(parts)-1; i++ {
+			remaining := strings.Join(parts[i:], ".")
+			if looksLikeHostname(remaining) {
+				prefix = strings.Join(parts[:i], ".")
+				hostname = remaining
+				return
+			}
+		}
+	}
+
+	// Fallback: no prefix, entire thing is hostname
+	hostname = handle
+	return "", hostname
+}
+
+func isLikelyTLDPart(s string) bool {
+	tlds := map[string]bool{
+		"com": true, "org": true, "net": true, "edu": true, "gov": true,
+		"io": true, "co": true, "uk": true, "de": true, "fr": true,
+		"jp": true, "au": true, "ca": true, "nl": true, "se": true,
+		"news": true, "blog": true, "tech": true, "dev": true,
+	}
+	return tlds[s]
+}
+
+// isTwoPartTLD reports whether first.second is a known two-part TLD.
+// (Currently unused within this module; kept for external callers.)
+func isTwoPartTLD(first, second string) bool {
+	twoPartTLDs := map[string]bool{
+		"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
+		"org.uk": true, "net.au": true, "com.br": true,
+	}
+	return twoPartTLDs[first+"."+second]
+}
+
+func looksLikeHostname(s string) bool {
+	// A hostname typically has at least one dot and ends with a TLD-like part
+	parts := strings.Split(s, ".")
+	if len(parts) < 2 {
+		return false
+	}
+	lastPart := parts[len(parts)-1]
+	return isLikelyTLDPart(lastPart)
+}
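
A sketch of SplitHandle on the documented input, plus the no-prefix fallback (not part of the commit):

package main

import (
	"fmt"

	shared "github.com/1440news/shared"
)

func main() {
	prefix, hostname := shared.SplitHandle("show.news.ycombinator.com.1440.news")
	fmt.Println(prefix, hostname) // show news.ycombinator.com

	// With no prefix label, the whole remainder is the hostname.
	prefix, hostname = shared.SplitHandle("ycombinator.com.1440.news")
	fmt.Printf("%q %s\n", prefix, hostname) // "" ycombinator.com
}
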
"author", "article", "podcast" + Title string `json:"title,omitempty"` + Description string `json:"description,omitempty"` + Language string `json:"language,omitempty"` + SiteURL string `json:"site_url,omitempty"` // The website the feed belongs to + + // Timing + DiscoveredAt time.Time `json:"discovered_at"` + LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked + NextCheckAt time.Time `json:"next_check_at,omitempty"` // feed_check: when to next check + LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated + + // Cache headers for conditional requests + ETag string `json:"etag,omitempty"` + LastModified string `json:"last_modified,omitempty"` + + // Health tracking + Status string `json:"status"` // "pass", "hold", "skip" + LastError string `json:"last_error,omitempty"` + LastErrorAt time.Time `json:"last_error_at,omitempty"` + + // Discovery source + SourceURL string `json:"source_url,omitempty"` + DomainHost string `json:"domain_host,omitempty"` + DomainTLD string `json:"domain_tld,omitempty"` + + // Content stats + ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check + OldestItemDate time.Time `json:"oldest_item_date,omitempty"` + NewestItemDate time.Time `json:"newest_item_date,omitempty"` + + // Adaptive check interval + NoUpdate int `json:"no_update"` // Consecutive checks with no change + + // Publishing to PDS + PublishStatus string `json:"publish_status"` // "hold", "pass", "skip" + PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news" +} + +// Enclosure represents a media attachment (audio, video, image) +type Enclosure struct { + URL string `json:"url"` + Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.) + Length int64 `json:"length"` // Size in bytes +} + +// Item represents an individual entry/article from a feed +type Item struct { + FeedURL string `json:"feed_url"` + GUID string `json:"guid,omitempty"` + Title string `json:"title,omitempty"` + Link string `json:"link,omitempty"` + Description string `json:"description,omitempty"` + Content string `json:"content,omitempty"` + Author string `json:"author,omitempty"` + PubDate time.Time `json:"pub_date,omitempty"` + DiscoveredAt time.Time `json:"discovered_at"` + UpdatedAt time.Time `json:"updated_at,omitempty"` + + // Media attachments + Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.) 
diff --git a/util.go b/util.go
new file mode 100644
index 0000000..00e9d3b
--- /dev/null
+++ b/util.go
@@ -0,0 +1,242 @@
+package shared
+
+import (
+	"net/url"
+	"strings"
+)
+
+// NormalizeURL strips the scheme (http/https) and www. prefix to save storage space.
+// The normalized URL can be reconstructed with https:// for fetching.
+func NormalizeURL(rawURL string) string {
+	// Remove scheme
+	u := rawURL
+	if strings.HasPrefix(u, "https://") {
+		u = u[8:]
+	} else if strings.HasPrefix(u, "http://") {
+		u = u[7:]
+	}
+
+	// Remove www. prefix
+	if strings.HasPrefix(u, "www.") {
+		u = u[4:]
+	}
+
+	return u
+}
+
+// NormalizeHost strips the www. prefix from a hostname for canonical storage
+func NormalizeHost(host string) string {
+	if strings.HasPrefix(host, "www.") {
+		return host[4:]
+	}
+	return host
+}
+
+// ReverseHost converts a reverse domain notation back to normal
+// e.g., "com.example.www" -> "www.example.com"
+func ReverseHost(reverseHost string) string {
+	parts := strings.Split(reverseHost, ".")
+	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
+		parts[i], parts[j] = parts[j], parts[i]
+	}
+	return strings.Join(parts, ".")
+}
+
+// GetTLD extracts the TLD from a hostname.
+// If the host contains no dot, the host itself is returned.
+func GetTLD(host string) string {
+	parts := strings.Split(host, ".")
+	if len(parts) > 0 {
+		return parts[len(parts)-1]
+	}
+	return ""
+}
+
+// StripTLD removes the TLD suffix from a hostname
+// e.g., "example.com" -> "example", "sub.example.com" -> "sub.example"
+func StripTLD(host string) string {
+	idx := strings.LastIndex(host, ".")
+	if idx > 0 {
+		return host[:idx]
+	}
+	return host
+}
+
+// GetDomainHost extracts the host part from a full domain (without TLD)
+// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
+func GetDomainHost(domain string) string {
+	return StripTLD(domain)
+}
+
+// FullHost reconstructs the full hostname from host and tld
+// e.g., ("example", "com") -> "example.com"
+func FullHost(host, tld string) string {
+	if tld == "" {
+		return host
+	}
+	return host + "." + tld
+}
+
+// MakeAbsoluteURL resolves a relative URL against a base URL
+func MakeAbsoluteURL(href, baseURL string) string {
+	base, err := url.Parse(baseURL)
+	if err != nil {
+		return href
+	}
+
+	link, err := url.Parse(href)
+	if err != nil {
+		return href
+	}
+
+	return base.ResolveReference(link).String()
+}
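
A sketch of the host/URL helpers above (not part of the commit; all inputs hypothetical):

package main

import (
	"fmt"

	shared "github.com/1440news/shared"
)

func main() {
	fmt.Println(shared.NormalizeURL("https://www.example.com/feed.xml"))      // example.com/feed.xml
	fmt.Println(shared.ReverseHost("com.example.www"))                        // www.example.com
	fmt.Println(shared.GetDomainHost("bbc.co.uk"))                            // bbc.co
	fmt.Println(shared.FullHost("bbc.co", "uk"))                              // bbc.co.uk
	fmt.Println(shared.MakeAbsoluteURL("/rss", "https://example.com/news/")) // https://example.com/rss
}
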
+
+// SearchQuery represents a parsed search with an optional type prefix
+type SearchQuery struct {
+	Type       string // "all", "domain", "url", "title", "description", "item"
+	Pattern    string // the search pattern (without prefix)
+	ExactMatch bool   // for domain searches: true if TLD was specified (d:npr.org matches exactly)
+	// For "all" type searches that look like domains, these are populated for additional exact matching
+	DomainHost string // e.g., "npr" from "npr.org"
+	DomainTLD  string // e.g., "org" from "npr.org"
+}
+
+// ParseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
+// Returns a SearchQuery with Type and Pattern.
+// Types: "all" (default or a: prefix), "domain" (d:, extracts TLD from pattern),
+// "url" (f:), "title" (t:), "description" (s:), "item" (i:)
+func ParseSearchPrefix(query string) SearchQuery {
+	query = strings.TrimSpace(query)
+	if query == "" {
+		return SearchQuery{Type: "all", Pattern: ""}
+	}
+
+	// Check for prefixes (case-insensitive)
+	lower := strings.ToLower(query)
+	if strings.HasPrefix(lower, "a:") {
+		return SearchQuery{Type: "all", Pattern: strings.TrimSpace(query[2:])}
+	}
+	if strings.HasPrefix(lower, "d:") {
+		return SearchQuery{Type: "domain", Pattern: strings.TrimSpace(query[2:])}
+	}
+	if strings.HasPrefix(lower, "f:") {
+		return SearchQuery{Type: "url", Pattern: strings.TrimSpace(query[2:])}
+	}
+	if strings.HasPrefix(lower, "t:") {
+		return SearchQuery{Type: "title", Pattern: strings.TrimSpace(query[2:])}
+	}
+	if strings.HasPrefix(lower, "s:") {
+		return SearchQuery{Type: "description", Pattern: strings.TrimSpace(query[2:])}
+	}
+	if strings.HasPrefix(lower, "i:") {
+		return SearchQuery{Type: "item", Pattern: strings.TrimSpace(query[2:])}
+	}
+
+	// For "all" type, check if the pattern looks like a domain and extract host/tld
+	result := SearchQuery{Type: "all", Pattern: query}
+	if LooksLikeDomain(query) {
+		host, tld := ParseSearchTerm(query)
+		if tld != "" {
+			result.DomainHost = host
+			result.DomainTLD = tld
+		}
+	}
+	return result
+}
+
+// LooksLikeDomain checks if a query looks like a domain name
+func LooksLikeDomain(query string) bool {
+	if query == "" || strings.Contains(query, " ") {
+		return false
+	}
+	// Must have at least one dot
+	lastDot := strings.LastIndex(query, ".")
+	if lastDot == -1 || lastDot == 0 || lastDot == len(query)-1 {
+		return false
+	}
+	// TLD must be 2-6 letters (upper- or lowercase)
+	tld := query[lastDot+1:]
+	if len(tld) < 2 || len(tld) > 6 {
+		return false
+	}
+	for _, c := range tld {
+		if c < 'a' || c > 'z' {
+			if c < 'A' || c > 'Z' {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// ParseSearchTerm analyzes a search query and extracts a host pattern and optional TLD filter.
+// If the search ends with what looks like a TLD (e.g., "example.com"), it splits them.
+// Returns (hostPattern, tldFilter) where tldFilter may be empty.
+func ParseSearchTerm(search string) (hostPattern, tldFilter string) {
+	search = strings.TrimSpace(search)
+	if search == "" {
+		return "", ""
+	}
+
+	// Check if search contains a dot
+	lastDot := strings.LastIndex(search, ".")
+	if lastDot == -1 || lastDot == len(search)-1 {
+		// No dot or ends with dot - treat as host-only search
+		return search, ""
+	}
+
+	// Extract potential TLD (part after last dot)
+	potentialTLD := strings.ToLower(search[lastDot+1:])
+	hostPart := search[:lastDot]
+
+	// Validate TLD: must be 2-24 lowercase letters (covers all IANA TLDs)
+	if len(potentialTLD) < 2 || len(potentialTLD) > 24 {
+		return search, ""
+	}
+	for _, c := range potentialTLD {
+		if c < 'a' || c > 'z' {
+			// Contains non-letter, not a TLD
+			return search, ""
+		}
+	}
+
+	// Looks like a valid TLD pattern
+	return hostPart, potentialTLD
+}
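
A sketch of the search-prefix parsing (not part of the commit; the queries are hypothetical):

package main

import (
	"fmt"

	shared "github.com/1440news/shared"
)

func main() {
	q := shared.ParseSearchPrefix("t:climate change")
	fmt.Println(q.Type, q.Pattern) // title climate change

	// A bare query that looks like a domain gets host/TLD extracted.
	q = shared.ParseSearchPrefix("npr.org")
	fmt.Println(q.Type, q.DomainHost, q.DomainTLD) // all npr org
}
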
+
+// ShouldCrawl checks if a link should be crawled (same host as base)
+func ShouldCrawl(link, baseURL string) bool {
+	linkURL, err := url.Parse(link)
+	if err != nil {
+		return false
+	}
+
+	baseURLParsed, err := url.Parse(baseURL)
+	if err != nil {
+		return false
+	}
+
+	return linkURL.Host == baseURLParsed.Host
+}
+
+// ShouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns
+func ShouldAutoSkipDomain(host string) bool {
+	// Never skip our own domain or its subdomains. Matching on ".1440.news"
+	// (with the leading dot) avoids false positives like "x1440.news".
+	if host == "1440.news" || strings.HasSuffix(host, ".1440.news") {
+		return false
+	}
+	// Skip bare TLDs (no dot means it's just "com", "net", etc.)
+	if !strings.Contains(host, ".") {
+		return true
+	}
+	// Skip domains starting with a digit (spam pattern)
+	if len(host) > 0 && host[0] >= '0' && host[0] <= '9' {
+		return true
+	}
+	// Skip domains starting with letter-dash (spam pattern, e.g., "a-example.com")
+	if len(host) > 1 && ((host[0] >= 'a' && host[0] <= 'z') || (host[0] >= 'A' && host[0] <= 'Z')) && host[1] == '-' {
+		return true
+	}
+	return false
+}
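
A sketch of the auto-skip heuristics (not part of the commit; the hosts are hypothetical):

package main

import (
	"fmt"

	shared "github.com/1440news/shared"
)

func main() {
	for _, host := range []string{
		"feeds.1440.news", // own subdomain: never skipped
		"com",             // bare TLD: skipped
		"123example.com",  // leading digit: skipped
		"a-example.com",   // letter-dash pattern: skipped
		"example.com",     // normal domain: kept
	} {
		fmt.Printf("%-17s skip=%v\n", host, shared.ShouldAutoSkipDomain(host))
	}
}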