Initial commit: shared Go module for 1440.news services

Contains:
- db.go: Database connection wrapper with helper methods
- models.go: Domain, Feed, Item, ShortURL, Click structs
- util.go: URL normalization, TLD functions, search helpers
- handle.go: AT Protocol handle derivation from feed URLs

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: primal
Date: 2026-02-02 11:19:04 -05:00
Commit: 1ff14ee957

6 changed files with 880 additions and 0 deletions
db.go (+156)
@@ -0,0 +1,156 @@
package shared
import (
"context"
"fmt"
"net/url"
"os"
"strings"
"time"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
)
// DB wraps pgxpool.Pool with helper methods
type DB struct {
*pgxpool.Pool
}
// OpenDatabase connects to PostgreSQL using environment variables or connection string
func OpenDatabase(connString string) (*DB, error) {
fmt.Printf("Connecting to database...\n")
// If connection string not provided, try environment variables
if connString == "" {
connString = os.Getenv("DATABASE_URL")
}
if connString == "" {
// Build from individual env vars
host := GetEnvOrDefault("DB_HOST", "infra-postgres")
port := GetEnvOrDefault("DB_PORT", "5432")
user := GetEnvOrDefault("DB_USER", "dba_1440_news")
dbname := GetEnvOrDefault("DB_NAME", "db_1440_news")
// Support Docker secrets (password file) or direct password
password := os.Getenv("DB_PASSWORD")
if password == "" {
if passwordFile := os.Getenv("DB_PASSWORD_FILE"); passwordFile != "" {
data, err := os.ReadFile(passwordFile)
if err != nil {
return nil, fmt.Errorf("failed to read password file: %v", err)
}
password = strings.TrimSpace(string(data))
}
}
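// Resulting DSN shape (credentials here are hypothetical):
// postgres://dba_1440_news:s3cret@infra-postgres:5432/db_1440_news?sslmode=disable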
connString = fmt.Sprintf("postgres://%s:%s@%s:%s/%s?sslmode=disable",
user, url.QueryEscape(password), host, port, dbname)
}
config, err := pgxpool.ParseConfig(connString)
if err != nil {
return nil, fmt.Errorf("failed to parse connection string: %v", err)
}
// Connection pool settings
config.MaxConns = 10
config.MinConns = 0 // Don't pre-create connections to avoid schema race conditions
config.MaxConnLifetime = 5 * time.Minute
config.MaxConnIdleTime = 1 * time.Minute
ctx := context.Background()
pool, err := pgxpool.NewWithConfig(ctx, config)
if err != nil {
return nil, fmt.Errorf("failed to connect to database: %v", err)
}
// Verify connection
if err := pool.Ping(ctx); err != nil {
pool.Close()
return nil, fmt.Errorf("failed to ping database: %v", err)
}
fmt.Println(" Connected to PostgreSQL")
return &DB{pool}, nil
}
// GetEnvOrDefault returns environment variable value or default
func GetEnvOrDefault(key, defaultVal string) string {
if val := os.Getenv(key); val != "" {
return val
}
return defaultVal
}
// QueryRow wraps pool.QueryRow for compatibility
func (db *DB) QueryRow(query string, args ...any) pgx.Row {
return db.Pool.QueryRow(context.Background(), query, args...)
}
// Query wraps pool.Query for compatibility
func (db *DB) Query(query string, args ...any) (pgx.Rows, error) {
return db.Pool.Query(context.Background(), query, args...)
}
// Exec wraps pool.Exec for compatibility
func (db *DB) Exec(query string, args ...any) (int64, error) {
result, err := db.Pool.Exec(context.Background(), query, args...)
if err != nil {
return 0, err
}
return result.RowsAffected(), nil
}
// Begin starts a transaction
func (db *DB) Begin() (pgx.Tx, error) {
return db.Pool.Begin(context.Background())
}
// Close closes the connection pool
func (db *DB) Close() error {
db.Pool.Close()
return nil
}
// NullableString returns nil for empty strings, otherwise the string pointer
func NullableString(s string) *string {
if s == "" {
return nil
}
return &s
}
// NullableTime returns nil for zero times, otherwise the time pointer
func NullableTime(t time.Time) *time.Time {
if t.IsZero() {
return nil
}
return &t
}
// StringValue returns empty string for nil, otherwise the dereferenced value
func StringValue(s *string) string {
if s == nil {
return ""
}
return *s
}
// TimeValue returns zero time for nil, otherwise the dereferenced value
func TimeValue(t *time.Time) time.Time {
if t == nil {
return time.Time{}
}
return *t
}
// ToSearchQuery converts a user query to PostgreSQL tsquery format
func ToSearchQuery(query string) string {
// Simple conversion: split on spaces and join with &
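// e.g. "climate change policy" → "climate & change & policy"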
words := strings.Fields(query)
if len(words) == 0 {
return ""
}
return strings.Join(words, " & ")
}
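
For illustration, a minimal sketch of a service consuming these helpers; the DSN, the feeds table, and the query are hypothetical, not part of this commit:

package main

import (
	"fmt"
	"log"

	"github.com/1440news/shared"
)

func main() {
	// An empty string falls back to DATABASE_URL or the DB_* variables;
	// the DSN here is a placeholder.
	db, err := shared.OpenDatabase("postgres://user:pass@localhost:5432/db_1440_news?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Hypothetical table, showing the context-free wrapper methods.
	var n int
	if err := db.QueryRow("SELECT count(*) FROM feeds").Scan(&n); err != nil {
		log.Fatal(err)
	}
	fmt.Println("feeds:", n, "tsquery:", shared.ToSearchQuery("climate change"))
}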
go.mod (+14)
@@ -0,0 +1,14 @@
module github.com/1440news/shared
go 1.24.0
require github.com/jackc/pgx/v5 v5.7.5
require (
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
golang.org/x/crypto v0.47.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/text v0.33.0 // indirect
)
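
Until the module is published, a consuming service would pin it with a replace directive; the local checkout path below is hypothetical:

// in a consuming service's go.mod
require github.com/1440news/shared v0.0.0
replace github.com/1440news/shared => ../shared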
go.sum (+28)
@@ -0,0 +1,28 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.5 h1:JHGfMnQY+IEtGM63d+NGMjoRpysB2JBwDr5fsngwmJs=
github.com/jackc/pgx/v5 v5.7.5/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
handle.go (+262)
@@ -0,0 +1,262 @@
package shared
import (
"net/url"
"regexp"
"strings"
)
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {domain}-{category}.1440.news
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
//
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
// news.ycombinator.com/rss → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
const maxSubdomainLen = 18 // PDS limit for first segment
// Ensure we have a scheme for parsing
if !strings.Contains(feedURL, "://") {
feedURL = "https://" + feedURL
}
u, err := url.Parse(feedURL)
if err != nil {
return ""
}
hostname := strings.ToLower(u.Hostname())
path := strings.ToLower(u.Path)
// Remove common feed suffixes/extensions
suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
for _, suffix := range suffixesToRemove {
path = strings.TrimSuffix(path, suffix)
}
// Split path into segments and filter noise
segments := strings.Split(strings.Trim(path, "/"), "/")
skipPathWords := map[string]bool{
"rss": true, "feed": true, "feeds": true, "atom": true,
"xml": true, "default": true, "index": true, "services": true,
"nyt": true,
}
var pathParts []string
for _, seg := range segments {
seg = cleanHandleSegment(seg)
if seg != "" && !skipPathWords[seg] {
pathParts = append(pathParts, seg)
}
}
// Split hostname and extract the meaningful domain
hostParts := strings.Split(hostname, ".")
// Two-part TLDs to handle specially
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
}
// Check for two-part TLD
if len(hostParts) >= 2 {
possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
if twoPartTLDs[possibleTwoPartTLD] {
hostParts = hostParts[:len(hostParts)-2]
} else {
// Single TLD - remove it
singleTLDs := map[string]bool{
"com": true, "org": true, "net": true, "io": true,
"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
}
if singleTLDs[hostParts[len(hostParts)-1]] {
hostParts = hostParts[:len(hostParts)-1]
}
}
}
// Skip noise subdomains
skipHostWords := map[string]bool{
"www": true, "feeds": true, "rss": true, "feed": true,
"api": true, "cdn": true, "static": true, "news": true,
}
var meaningfulHostParts []string
for _, part := range hostParts {
if !skipHostWords[part] && part != "" {
meaningfulHostParts = append(meaningfulHostParts, part)
}
}
// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
var mainDomain string
if len(meaningfulHostParts) > 0 {
mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
} else if len(hostParts) > 0 {
mainDomain = hostParts[len(hostParts)-1]
}
// Special case: "bbci" should become "bbc"
if mainDomain == "bbci" {
mainDomain = "bbc"
}
// Abbreviations for long category names to fit 18-char limit
categoryAbbrevs := map[string]string{
"science-and-environment": "sci-env",
"entertainment-and-arts": "ent-arts",
"science-environment": "sci-env",
"entertainment-arts": "ent-arts",
"technology": "tech",
"business": "biz",
"international": "intl",
"environment": "env",
"entertainment": "ent",
"politics": "pol",
}
// Build subdomain: domain + category (from path)
var subdomain string
if len(pathParts) > 0 {
// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
category := pathParts[len(pathParts)-1]
// Skip generic categories
if category == "news" && len(pathParts) == 1 {
subdomain = mainDomain
} else {
// Try to abbreviate if the full subdomain would be too long
fullSubdomain := mainDomain + "-" + category
if len(fullSubdomain) > maxSubdomainLen {
if abbrev, ok := categoryAbbrevs[category]; ok {
category = abbrev
}
}
subdomain = mainDomain + "-" + category
}
} else {
subdomain = mainDomain
}
// If still too long, just use main hostname
if len(subdomain) > maxSubdomainLen {
subdomain = mainDomain
}
// Final safety: truncate if still too long
if len(subdomain) > maxSubdomainLen {
subdomain = subdomain[:maxSubdomainLen]
}
subdomain = strings.Trim(subdomain, "-")
// Collapse multiple hyphens
for strings.Contains(subdomain, "--") {
subdomain = strings.ReplaceAll(subdomain, "--", "-")
}
return subdomain + ".1440.news"
}
// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
func cleanHandleSegment(s string) string {
// Remove file extensions
if idx := strings.LastIndex(s, "."); idx > 0 {
s = s[:idx]
}
// Convert to lowercase
s = strings.ToLower(s)
// Strip common feed prefixes/suffixes from the segment itself
// e.g., "showrss" → "show", "rssworld" → "world"
feedAffixes := []string{"rss", "feed", "atom", "xml"}
for _, affix := range feedAffixes {
// Strip suffix (e.g., "showrss" → "show")
if strings.HasSuffix(s, affix) && len(s) > len(affix) {
s = strings.TrimSuffix(s, affix)
break
}
// Strip prefix (e.g., "rssworld" → "world")
if strings.HasPrefix(s, affix) && len(s) > len(affix) {
s = strings.TrimPrefix(s, affix)
break
}
}
// Replace underscores and other separators with hyphens
s = strings.ReplaceAll(s, "_", "-")
s = strings.ReplaceAll(s, " ", "-")
// Remove any characters that aren't alphanumeric or hyphens
reg := regexp.MustCompile(`[^a-z0-9-]`)
s = reg.ReplaceAllString(s, "")
// Collapse multiple hyphens
for strings.Contains(s, "--") {
s = strings.ReplaceAll(s, "--", "-")
}
// Trim leading/trailing hyphens
s = strings.Trim(s, "-")
return s
}
// SplitHandle extracts the path prefix and hostname from a derived handle
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
func SplitHandle(handle string) (prefix string, hostname string) {
// Remove .1440.news suffix
handle = strings.TrimSuffix(handle, ".1440.news")
parts := strings.Split(handle, ".")
// Try to find where hostname starts by looking for valid hostname patterns
if len(parts) >= 2 {
for i := 0; i < len(parts)-1; i++ {
remaining := strings.Join(parts[i:], ".")
if looksLikeHostname(remaining) {
if i > 0 {
prefix = strings.Join(parts[:i], ".")
}
hostname = remaining
return
}
}
}
// Fallback: no prefix, entire thing is hostname
hostname = handle
return "", hostname
}
func isLikelyTLDPart(s string) bool {
tlds := map[string]bool{
"com": true, "org": true, "net": true, "edu": true, "gov": true,
"io": true, "co": true, "uk": true, "de": true, "fr": true,
"jp": true, "au": true, "ca": true, "nl": true, "se": true,
"news": true, "blog": true, "tech": true, "dev": true,
}
return tlds[s]
}
func isTwoPartTLD(first, second string) bool {
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
"org.uk": true, "net.au": true, "com.br": true,
}
return twoPartTLDs[first+"."+second]
}
func looksLikeHostname(s string) bool {
// A hostname typically has at least one dot and ends with a TLD-like part
parts := strings.Split(s, ".")
if len(parts) < 2 {
return false
}
lastPart := parts[len(parts)-1]
return isLikelyTLDPart(lastPart)
}
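
A small driver (illustrative, not part of this commit) exercising the derivation against the doc-comment examples:

package main

import (
	"fmt"

	"github.com/1440news/shared"
)

func main() {
	for _, feed := range []string{
		"feeds.bbci.co.uk/news/technology/rss.xml", // → bbc-technology.1440.news
		"news.ycombinator.com/rss",                 // → ycombinator.1440.news
	} {
		fmt.Println(feed, "→", shared.DeriveHandleFromFeed(feed))
	}
}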
models.go (+178)
@@ -0,0 +1,178 @@
package shared
import (
"time"
)
// Domain represents a host to process for feeds
// Status: hold (pending review), pass (approved), skip (not processing), dead (retired TLD)
// CrawledAt: zero time = needs domain_check, +1 sec = needs feed_crawl, real time = done
type Domain struct {
Host string `json:"host"`
Status string `json:"status"`
CrawledAt time.Time `json:"crawled_at"`
FeedsFound int `json:"feeds_found,omitempty"`
LastError string `json:"last_error,omitempty"`
TLD string `json:"tld,omitempty"`
MissCount int `json:"miss_count,omitempty"`
}
// MissCountThreshold is the number of consecutive errors before setting status to hold
const MissCountThreshold = 100
// Sentinel values for domain processing state
var (
DomainStateUnchecked = time.Time{} // 0001-01-01 00:00:00 - needs domain_check
DomainStateChecked = time.Time{}.Add(time.Second) // 0001-01-01 00:00:01 - needs feed_crawl
)
// FullHost returns the complete hostname (host + tld)
func (d *Domain) FullHost() string {
return FullHost(d.Host, d.TLD)
}
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
URL string `json:"url"`
Type string `json:"type"` // "rss", "atom", "json", or "unknown"
Category string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
Title string `json:"title,omitempty"`
Description string `json:"description,omitempty"`
Language string `json:"language,omitempty"`
SiteURL string `json:"site_url,omitempty"` // The website the feed belongs to
// Timing
DiscoveredAt time.Time `json:"discovered_at"`
LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked
NextCheckAt time.Time `json:"next_check_at,omitempty"` // feed_check: when to next check
LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated
// Cache headers for conditional requests
ETag string `json:"etag,omitempty"`
LastModified string `json:"last_modified,omitempty"`
// Health tracking
Status string `json:"status"` // "pass", "hold", "skip"
LastError string `json:"last_error,omitempty"`
LastErrorAt time.Time `json:"last_error_at,omitempty"`
// Discovery source
SourceURL string `json:"source_url,omitempty"`
DomainHost string `json:"domain_host,omitempty"`
DomainTLD string `json:"domain_tld,omitempty"`
// Content stats
ItemCount int `json:"item_count,omitempty"` // Number of items in last feed_check
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
// Adaptive check interval
NoUpdate int `json:"no_update"` // Consecutive checks with no change
// Publishing to PDS
PublishStatus string `json:"publish_status"` // "hold", "pass", "skip"
PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}
// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
URL string `json:"url"`
Type string `json:"type"` // MIME type (audio/mpeg, image/jpeg, etc.)
Length int64 `json:"length"` // Size in bytes
}
// Item represents an individual entry/article from a feed
type Item struct {
FeedURL string `json:"feed_url"`
GUID string `json:"guid,omitempty"`
Title string `json:"title,omitempty"`
Link string `json:"link,omitempty"`
Description string `json:"description,omitempty"`
Content string `json:"content,omitempty"`
Author string `json:"author,omitempty"`
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
// Media attachments
Enclosure *Enclosure `json:"enclosure,omitempty"` // Primary enclosure (podcast audio, etc.)
ImageURLs []string `json:"image_urls,omitempty"` // Image URLs extracted from content
Tags []string `json:"tags,omitempty"` // Category/tag strings from feed
// Publishing to PDS
PublishedAt time.Time `json:"published_at,omitempty"`
PublishedUri string `json:"published_uri,omitempty"`
}
// ShortURL represents a shortened URL mapping
type ShortURL struct {
Code string `json:"code"`
OriginalURL string `json:"original_url"`
ItemGUID string `json:"item_guid,omitempty"`
FeedURL string `json:"feed_url,omitempty"`
CreatedAt time.Time `json:"created_at"`
ClickCount int `json:"click_count"`
}
// Click represents a click event on a short URL
type Click struct {
ID int64 `json:"id"`
ShortCode string `json:"short_code"`
ClickedAt time.Time `json:"clicked_at"`
Referrer string `json:"referrer,omitempty"`
UserAgent string `json:"user_agent,omitempty"`
IPHash string `json:"ip_hash,omitempty"`
Country string `json:"country,omitempty"`
}
// DashboardStats holds all statistics for the dashboard
type DashboardStats struct {
// Domain stats
TotalDomains int `json:"total_domains"`
HoldDomains int `json:"hold_domains"`
PassDomains int `json:"pass_domains"`
SkipDomains int `json:"skip_domains"`
DeadDomains int `json:"dead_domains"`
// Feed stats
TotalFeeds int `json:"total_feeds"`
AliveFeeds int `json:"alive_feeds"` // status='pass' (healthy feeds)
PublishFeeds int `json:"publish_feeds"` // publish_status='pass' (approved for publishing)
SkipFeeds int `json:"skip_feeds"`
HoldFeeds int `json:"hold_feeds"`
DeadFeeds int `json:"dead_feeds"`
EmptyFeeds int `json:"empty_feeds"`
RSSFeeds int `json:"rss_feeds"`
AtomFeeds int `json:"atom_feeds"`
JSONFeeds int `json:"json_feeds"`
UnknownFeeds int `json:"unknown_feeds"`
// Processing rates (per minute)
DomainsCrawled int32 `json:"domains_crawled"` // feed_crawl count
DomainCheckRate int `json:"domain_check_rate"` // domain_check per minute
FeedCrawlRate int `json:"feed_crawl_rate"` // feed_crawl per minute
FeedCheckRate int `json:"feed_check_rate"` // feed_check per minute
// Timing
UpdatedAt time.Time `json:"updated_at"`
}
// TLDStat holds TLD statistics
type TLDStat struct {
TLD string `json:"tld"`
Count int `json:"count"`
}
// DomainStat holds domain statistics
type DomainStat struct {
Host string `json:"host"`
FeedsFound int `json:"feeds_found"`
}
// FeedInfo holds basic feed metadata for profile setup
type FeedInfo struct {
Title string
Description string
SiteURL string
SourceHost string
}
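
A sketch of the CrawledAt sentinel convention in use; the crawlState helper is illustrative only:

package main

import (
	"fmt"
	"time"

	"github.com/1440news/shared"
)

// crawlState maps the documented CrawledAt sentinels to processing states.
func crawlState(d shared.Domain) string {
	switch {
	case d.CrawledAt.Equal(shared.DomainStateUnchecked):
		return "needs domain_check"
	case d.CrawledAt.Equal(shared.DomainStateChecked):
		return "needs feed_crawl"
	default:
		return "done"
	}
}

func main() {
	d := shared.Domain{Host: "npr", TLD: "org", Status: "pass"}
	fmt.Println(d.FullHost(), "→", crawlState(d)) // npr.org → needs domain_check

	d.CrawledAt = shared.DomainStateChecked
	fmt.Println(crawlState(d)) // needs feed_crawl

	d.CrawledAt = time.Now()
	fmt.Println(crawlState(d)) // done
}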
util.go (+242)
@@ -0,0 +1,242 @@
package shared
import (
"net/url"
"strings"
)
// NormalizeURL strips scheme (http/https) and www. prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
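// e.g. "https://www.example.com/feeds/all.xml" → "example.com/feeds/all.xml"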
func NormalizeURL(rawURL string) string {
// Remove scheme
u := rawURL
if strings.HasPrefix(u, "https://") {
u = u[8:]
} else if strings.HasPrefix(u, "http://") {
u = u[7:]
}
// Remove www. prefix
if strings.HasPrefix(u, "www.") {
u = u[4:]
}
return u
}
// NormalizeHost strips www. prefix from a hostname for canonical storage
func NormalizeHost(host string) string {
if strings.HasPrefix(host, "www.") {
return host[4:]
}
return host
}
// ReverseHost converts a reverse domain notation back to normal
// e.g., "com.example.www" -> "www.example.com"
func ReverseHost(reverseHost string) string {
parts := strings.Split(reverseHost, ".")
for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
parts[i], parts[j] = parts[j], parts[i]
}
return strings.Join(parts, ".")
}
// GetTLD extracts the TLD from a hostname
func GetTLD(host string) string {
parts := strings.Split(host, ".")
if len(parts) > 0 {
return parts[len(parts)-1]
}
return ""
}
// StripTLD removes the TLD suffix from a hostname
// e.g., "example.com" -> "example", "sub.example.com" -> "sub.example"
func StripTLD(host string) string {
idx := strings.LastIndex(host, ".")
if idx > 0 {
return host[:idx]
}
return host
}
// GetDomainHost extracts the host part from a full domain (without TLD)
// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
func GetDomainHost(domain string) string {
return StripTLD(domain)
}
// FullHost reconstructs the full hostname from host and tld
// e.g., ("example", "com") -> "example.com"
func FullHost(host, tld string) string {
if tld == "" {
return host
}
return host + "." + tld
}
// MakeAbsoluteURL resolves a relative URL against a base URL
func MakeAbsoluteURL(href, baseURL string) string {
base, err := url.Parse(baseURL)
if err != nil {
return href
}
link, err := url.Parse(href)
if err != nil {
return href
}
return base.ResolveReference(link).String()
}
// SearchQuery represents a parsed search with optional type prefix
type SearchQuery struct {
Type string // "all", "domain", "url", "title", "description", "item"
Pattern string // the search pattern (without prefix)
ExactMatch bool // for domain searches: true if TLD was specified (d:npr.org matches exactly)
// For "all" type searches that look like domains, these are populated for additional exact matching
DomainHost string // e.g., "npr" from "npr.org"
DomainTLD string // e.g., "org" from "npr.org"
}
// ParseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
// Returns SearchQuery with Type and Pattern
// Types: "all" (default or a: prefix), "domain" (d:, extracts TLD from pattern),
//
// "url" (f:), "title" (t:), "description" (s:), "item" (i:)
func ParseSearchPrefix(query string) SearchQuery {
query = strings.TrimSpace(query)
if query == "" {
return SearchQuery{Type: "all", Pattern: ""}
}
// Check for prefixes (case-insensitive)
lower := strings.ToLower(query)
if strings.HasPrefix(lower, "a:") {
return SearchQuery{Type: "all", Pattern: strings.TrimSpace(query[2:])}
}
if strings.HasPrefix(lower, "d:") {
return SearchQuery{Type: "domain", Pattern: strings.TrimSpace(query[2:])}
}
if strings.HasPrefix(lower, "f:") {
return SearchQuery{Type: "url", Pattern: strings.TrimSpace(query[2:])}
}
if strings.HasPrefix(lower, "t:") {
return SearchQuery{Type: "title", Pattern: strings.TrimSpace(query[2:])}
}
if strings.HasPrefix(lower, "s:") {
return SearchQuery{Type: "description", Pattern: strings.TrimSpace(query[2:])}
}
if strings.HasPrefix(lower, "i:") {
return SearchQuery{Type: "item", Pattern: strings.TrimSpace(query[2:])}
}
// For "all" type, check if pattern looks like a domain and extract host/tld
result := SearchQuery{Type: "all", Pattern: query}
if LooksLikeDomain(query) {
host, tld := ParseSearchTerm(query)
if tld != "" {
result.DomainHost = host
result.DomainTLD = tld
}
}
return result
}
// LooksLikeDomain checks if a query looks like a domain name
func LooksLikeDomain(query string) bool {
if query == "" || strings.Contains(query, " ") {
return false
}
// Must have at least one dot
lastDot := strings.LastIndex(query, ".")
if lastDot == -1 || lastDot == 0 || lastDot == len(query)-1 {
return false
}
// TLD must be 2-6 letters (upper- or lowercase)
tld := query[lastDot+1:]
if len(tld) < 2 || len(tld) > 6 {
return false
}
for _, c := range tld {
if c < 'a' || c > 'z' {
if c < 'A' || c > 'Z' {
return false
}
}
}
return true
}
// ParseSearchTerm analyzes a search query and extracts host pattern and optional TLD filter.
// If the search ends with what looks like a TLD (e.g., "example.com"), it splits them.
// Returns (hostPattern, tldFilter) where tldFilter may be empty.
func ParseSearchTerm(search string) (hostPattern, tldFilter string) {
search = strings.TrimSpace(search)
if search == "" {
return "", ""
}
// Check if search contains a dot
lastDot := strings.LastIndex(search, ".")
if lastDot == -1 || lastDot == len(search)-1 {
// No dot or ends with dot - treat as host-only search
return search, ""
}
// Extract potential TLD (part after last dot)
potentialTLD := strings.ToLower(search[lastDot+1:])
hostPart := search[:lastDot]
// Validate TLD: must be 2-24 lowercase letters (covers all IANA TLDs)
if len(potentialTLD) < 2 || len(potentialTLD) > 24 {
return search, ""
}
for _, c := range potentialTLD {
if c < 'a' || c > 'z' {
// Contains non-letter, not a TLD
return search, ""
}
}
// Looks like a valid TLD pattern
return hostPart, potentialTLD
}
// ShouldCrawl checks if a link should be crawled (same host as base)
func ShouldCrawl(link, baseURL string) bool {
linkURL, err := url.Parse(link)
if err != nil {
return false
}
baseURLParsed, err := url.Parse(baseURL)
if err != nil {
return false
}
return linkURL.Host == baseURLParsed.Host
}
// ShouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns
func ShouldAutoSkipDomain(host string) bool {
// Never skip our own domain
if strings.HasSuffix(host, "1440.news") || host == "1440.news" {
return false
}
// Skip bare TLDs (no dot means it's just "com", "net", etc.)
if !strings.Contains(host, ".") {
return true
}
// Skip domains starting with a digit (spam pattern)
if len(host) > 0 && host[0] >= '0' && host[0] <= '9' {
return true
}
// Skip domains starting with letter-dash (spam pattern, e.g., "a-example.com")
if len(host) > 1 && ((host[0] >= 'a' && host[0] <= 'z') || (host[0] >= 'A' && host[0] <= 'Z')) && host[1] == '-' {
return true
}
return false
}
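
Finally, a short sketch (illustrative driver only) of the URL and search helpers end to end:

package main

import (
	"fmt"

	"github.com/1440news/shared"
)

func main() {
	// Bare domains in an "all" search also get an exact host/TLD split.
	q := shared.ParseSearchPrefix("npr.org")
	fmt.Println(q.Type, q.Pattern, q.DomainHost, q.DomainTLD) // all npr.org npr org

	host, tld := shared.ParseSearchTerm("bbc.co.uk")
	fmt.Println(host, tld) // bbc.co uk

	fmt.Println(shared.NormalizeURL("https://www.npr.org/rss/rss.php")) // npr.org/rss/rss.php
	fmt.Println(shared.ShouldAutoSkipDomain("7days.example"))           // true: leading digit
}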