Initial commit: shared Go module for 1440.news services
Contains:
- db.go: Database connection wrapper with helper methods
- models.go: Domain, Feed, Item, ShortURL, Click structs
- util.go: URL normalization, TLD functions, search helpers
- handle.go: AT Protocol handle derivation from feed URLs

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
db.go
@@ -0,0 +1,156 @@
package shared

import (
	"context"
	"fmt"
	"net/url"
	"os"
	"strings"
	"time"

	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
)

// DB wraps pgxpool.Pool with helper methods
type DB struct {
	*pgxpool.Pool
}

// OpenDatabase connects to PostgreSQL using environment variables or connection string
func OpenDatabase(connString string) (*DB, error) {
	fmt.Printf("Connecting to database...\n")

	// If connection string not provided, try environment variables
	if connString == "" {
		connString = os.Getenv("DATABASE_URL")
	}
	if connString == "" {
		// Build from individual env vars
		host := GetEnvOrDefault("DB_HOST", "infra-postgres")
		port := GetEnvOrDefault("DB_PORT", "5432")
		user := GetEnvOrDefault("DB_USER", "dba_1440_news")
		dbname := GetEnvOrDefault("DB_NAME", "db_1440_news")

		// Support Docker secrets (password file) or direct password
		password := os.Getenv("DB_PASSWORD")
		if password == "" {
			if passwordFile := os.Getenv("DB_PASSWORD_FILE"); passwordFile != "" {
				data, err := os.ReadFile(passwordFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read password file: %v", err)
				}
				password = strings.TrimSpace(string(data))
			}
		}

		connString = fmt.Sprintf("postgres://%s:%s@%s:%s/%s?sslmode=disable",
			user, url.QueryEscape(password), host, port, dbname)
	}

	config, err := pgxpool.ParseConfig(connString)
	if err != nil {
		return nil, fmt.Errorf("failed to parse connection string: %v", err)
	}

	// Connection pool settings
	config.MaxConns = 10
	config.MinConns = 0 // Don't pre-create connections to avoid schema race conditions
	config.MaxConnLifetime = 5 * time.Minute
	config.MaxConnIdleTime = 1 * time.Minute

	ctx := context.Background()
	pool, err := pgxpool.NewWithConfig(ctx, config)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to database: %v", err)
	}

	// Verify connection
	if err := pool.Ping(ctx); err != nil {
		pool.Close()
		return nil, fmt.Errorf("failed to ping database: %v", err)
	}
	fmt.Println("Connected to PostgreSQL")

	return &DB{pool}, nil
}

// GetEnvOrDefault returns environment variable value or default
func GetEnvOrDefault(key, defaultVal string) string {
	if val := os.Getenv(key); val != "" {
		return val
	}
	return defaultVal
}

// QueryRow wraps pool.QueryRow for compatibility
func (db *DB) QueryRow(query string, args ...interface{}) pgx.Row {
	return db.Pool.QueryRow(context.Background(), query, args...)
}

// Query wraps pool.Query for compatibility
func (db *DB) Query(query string, args ...interface{}) (pgx.Rows, error) {
	return db.Pool.Query(context.Background(), query, args...)
}

// Exec wraps pool.Exec for compatibility
func (db *DB) Exec(query string, args ...interface{}) (int64, error) {
	result, err := db.Pool.Exec(context.Background(), query, args...)
	if err != nil {
		return 0, err
	}
	return result.RowsAffected(), nil
}

// Begin starts a transaction
func (db *DB) Begin() (pgx.Tx, error) {
	return db.Pool.Begin(context.Background())
}

// Close closes the connection pool
func (db *DB) Close() error {
	db.Pool.Close()
	return nil
}

// NullableString returns nil for empty strings, otherwise the string pointer
func NullableString(s string) *string {
	if s == "" {
		return nil
	}
	return &s
}

// NullableTime returns nil for zero times, otherwise the time pointer
func NullableTime(t time.Time) *time.Time {
	if t.IsZero() {
		return nil
	}
	return &t
}

// StringValue returns empty string for nil, otherwise the dereferenced value
func StringValue(s *string) string {
	if s == nil {
		return ""
	}
	return *s
}

// TimeValue returns zero time for nil, otherwise the dereferenced value
func TimeValue(t *time.Time) time.Time {
	if t == nil {
		return time.Time{}
	}
	return *t
}

// ToSearchQuery converts a user query to PostgreSQL tsquery format
func ToSearchQuery(query string) string {
	// Simple conversion: split on spaces and join with &
	words := strings.Fields(query)
	if len(words) == 0 {
		return ""
	}
	return strings.Join(words, " & ")
}
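Typical usage from a consuming service, as a minimal sketch (not part of this commit; the `domains` table name is an assumption):

package main

import (
	"fmt"
	"log"

	"github.com/1440news/shared"
)

func main() {
	// Passing "" falls back to DATABASE_URL, then to the individual DB_* vars.
	db, err := shared.OpenDatabase("")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// The wrapper methods hide the context plumbing that pgx otherwise requires.
	var count int
	if err := db.QueryRow("SELECT count(*) FROM domains").Scan(&count); err != nil {
		log.Fatal(err)
	}
	fmt.Println("domains:", count)
}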
go.mod
@@ -0,0 +1,14 @@
module github.com/1440news/shared

go 1.24.0

require github.com/jackc/pgx/v5 v5.7.5

require (
	github.com/jackc/pgpassfile v1.0.0 // indirect
	github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
	github.com/jackc/puddle/v2 v2.2.2 // indirect
	golang.org/x/crypto v0.47.0 // indirect
	golang.org/x/sync v0.19.0 // indirect
	golang.org/x/text v0.33.0 // indirect
)
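A service consuming this module would declare it in its own go.mod; a sketch, where the service path and local checkout location are assumptions:

module github.com/1440news/feedcheck

go 1.24.0

require github.com/1440news/shared v0.0.0

// For local development against an unpublished checkout:
replace github.com/1440news/shared => ../shared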
go.sum
@@ -0,0 +1,28 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.5 h1:JHGfMnQY+IEtGM63d+NGMjoRpysB2JBwDr5fsngwmJs=
github.com/jackc/pgx/v5 v5.7.5/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
handle.go
@@ -0,0 +1,262 @@
package shared

import (
	"net/url"
	"regexp"
	"strings"
)

// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {domain}-{category}.1440.news
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
//
//	feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
//	news.ycombinator.com/rss → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
	const maxSubdomainLen = 18 // PDS limit for first segment

	// Ensure we have a scheme for parsing
	if !strings.Contains(feedURL, "://") {
		feedURL = "https://" + feedURL
	}

	u, err := url.Parse(feedURL)
	if err != nil {
		return ""
	}

	hostname := strings.ToLower(u.Hostname())
	path := strings.ToLower(u.Path)

	// Remove common feed suffixes/extensions
	suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
	for _, suffix := range suffixesToRemove {
		path = strings.TrimSuffix(path, suffix)
	}

	// Split path into segments and filter noise
	segments := strings.Split(strings.Trim(path, "/"), "/")
	skipPathWords := map[string]bool{
		"rss": true, "feed": true, "feeds": true, "atom": true,
		"xml": true, "default": true, "index": true, "services": true,
		"nyt": true,
	}

	var pathParts []string
	for _, seg := range segments {
		seg = cleanHandleSegment(seg)
		if seg != "" && !skipPathWords[seg] {
			pathParts = append(pathParts, seg)
		}
	}

	// Split hostname and extract the meaningful domain
	hostParts := strings.Split(hostname, ".")

	// Two-part TLDs to handle specially
	twoPartTLDs := map[string]bool{
		"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
		"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
	}

	// Check for two-part TLD
	if len(hostParts) >= 2 {
		possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
		if twoPartTLDs[possibleTwoPartTLD] {
			hostParts = hostParts[:len(hostParts)-2]
		} else {
			// Single TLD - remove it
			singleTLDs := map[string]bool{
				"com": true, "org": true, "net": true, "io": true,
				"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
			}
			if singleTLDs[hostParts[len(hostParts)-1]] {
				hostParts = hostParts[:len(hostParts)-1]
			}
		}
	}

	// Skip noise subdomains
	skipHostWords := map[string]bool{
		"www": true, "feeds": true, "rss": true, "feed": true,
		"api": true, "cdn": true, "static": true, "news": true,
	}

	var meaningfulHostParts []string
	for _, part := range hostParts {
		if !skipHostWords[part] && part != "" {
			meaningfulHostParts = append(meaningfulHostParts, part)
		}
	}

	// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
	var mainDomain string
	if len(meaningfulHostParts) > 0 {
		mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
	} else if len(hostParts) > 0 {
		mainDomain = hostParts[len(hostParts)-1]
	}

	// Special case: "bbci" should become "bbc"
	if mainDomain == "bbci" {
		mainDomain = "bbc"
	}

	// Abbreviations for long category names to fit 18-char limit
	categoryAbbrevs := map[string]string{
		"science-and-environment": "sci-env",
		"entertainment-and-arts":  "ent-arts",
		"science-environment":     "sci-env",
		"entertainment-arts":      "ent-arts",
		"technology":              "tech",
		"business":                "biz",
		"international":           "intl",
		"environment":             "env",
		"entertainment":           "ent",
		"politics":                "pol",
	}

	// Build subdomain: domain + category (from path)
	var subdomain string
	if len(pathParts) > 0 {
		// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
		category := pathParts[len(pathParts)-1]
		// Skip generic categories
		if category == "news" && len(pathParts) == 1 {
			subdomain = mainDomain
		} else {
			// Try to abbreviate if the full subdomain would be too long
			fullSubdomain := mainDomain + "-" + category
			if len(fullSubdomain) > maxSubdomainLen {
				if abbrev, ok := categoryAbbrevs[category]; ok {
					category = abbrev
				}
			}
			subdomain = mainDomain + "-" + category
		}
	} else {
		subdomain = mainDomain
	}

	// If still too long, just use main hostname
	if len(subdomain) > maxSubdomainLen {
		subdomain = mainDomain
	}

	// Final safety: truncate if still too long
	if len(subdomain) > maxSubdomainLen {
		subdomain = subdomain[:maxSubdomainLen]
	}

	subdomain = strings.Trim(subdomain, "-")

	// Collapse multiple hyphens
	for strings.Contains(subdomain, "--") {
		subdomain = strings.ReplaceAll(subdomain, "--", "-")
	}

	return subdomain + ".1440.news"
}

// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
func cleanHandleSegment(s string) string {
	// Remove file extensions
	if idx := strings.LastIndex(s, "."); idx > 0 {
		s = s[:idx]
	}

	// Convert to lowercase
	s = strings.ToLower(s)

	// Strip common feed prefixes/suffixes from the segment itself
	// e.g., "showrss" → "show", "rssworld" → "world"
	feedAffixes := []string{"rss", "feed", "atom", "xml"}
	for _, affix := range feedAffixes {
		// Strip suffix (e.g., "showrss" → "show")
		if strings.HasSuffix(s, affix) && len(s) > len(affix) {
			s = strings.TrimSuffix(s, affix)
			break
		}
		// Strip prefix (e.g., "rssworld" → "world")
		if strings.HasPrefix(s, affix) && len(s) > len(affix) {
			s = strings.TrimPrefix(s, affix)
			break
		}
	}

	// Replace underscores and other separators with hyphens
	s = strings.ReplaceAll(s, "_", "-")
	s = strings.ReplaceAll(s, " ", "-")

	// Remove any characters that aren't alphanumeric or hyphens
	reg := regexp.MustCompile(`[^a-z0-9-]`)
	s = reg.ReplaceAllString(s, "")

	// Collapse multiple hyphens
	for strings.Contains(s, "--") {
		s = strings.ReplaceAll(s, "--", "-")
	}

	// Trim leading/trailing hyphens
	s = strings.Trim(s, "-")

	return s
}

// SplitHandle extracts the path prefix and hostname from a derived handle
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
func SplitHandle(handle string) (prefix string, hostname string) {
	// Remove .1440.news suffix
	handle = strings.TrimSuffix(handle, ".1440.news")

	parts := strings.Split(handle, ".")

	// Try to find where hostname starts by looking for valid hostname patterns
	if len(parts) >= 2 {
		for i := 0; i < len(parts)-1; i++ {
			remaining := strings.Join(parts[i:], ".")
			if looksLikeHostname(remaining) {
				if i > 0 {
					prefix = strings.Join(parts[:i], ".")
				}
				hostname = remaining
				return
			}
		}
	}

	// Fallback: no prefix, entire thing is hostname
	hostname = handle
	return "", hostname
}

func isLikelyTLDPart(s string) bool {
	tlds := map[string]bool{
		"com": true, "org": true, "net": true, "edu": true, "gov": true,
		"io": true, "co": true, "uk": true, "de": true, "fr": true,
		"jp": true, "au": true, "ca": true, "nl": true, "se": true,
		"news": true, "blog": true, "tech": true, "dev": true,
	}
	return tlds[s]
}

func isTwoPartTLD(first, second string) bool {
	twoPartTLDs := map[string]bool{
		"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
		"org.uk": true, "net.au": true, "com.br": true,
	}
	return twoPartTLDs[first+"."+second]
}

func looksLikeHostname(s string) bool {
	// A hostname typically has at least one dot and ends with a TLD-like part
	parts := strings.Split(s, ".")
	if len(parts) < 2 {
		return false
	}
	lastPart := parts[len(parts)-1]
	return isLikelyTLDPart(lastPart)
}
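A quick sketch exercising the documented derivation examples (not part of this commit):

package main

import (
	"fmt"

	"github.com/1440news/shared"
)

func main() {
	// The scheme is optional; DeriveHandleFromFeed prepends https:// as needed.
	fmt.Println(shared.DeriveHandleFromFeed("feeds.bbci.co.uk/news/technology/rss.xml"))
	// bbc-technology.1440.news
	fmt.Println(shared.DeriveHandleFromFeed("news.ycombinator.com/rss"))
	// ycombinator.1440.news
}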
models.go
@@ -0,0 +1,178 @@
package shared

import (
	"time"
)

// Domain represents a host to process for feeds
// Status: hold (pending review), pass (approved), skip (not processing), dead (retired TLD)
// CrawledAt: zero time = needs domain_check, +1 sec = needs feed_crawl, real time = done
type Domain struct {
	Host       string    `json:"host"`
	Status     string    `json:"status"`
	CrawledAt  time.Time `json:"crawled_at"`
	FeedsFound int       `json:"feeds_found,omitempty"`
	LastError  string    `json:"last_error,omitempty"`
	TLD        string    `json:"tld,omitempty"`
	MissCount  int       `json:"miss_count,omitempty"`
}

// MissCountThreshold is the number of consecutive errors before setting status to hold
const MissCountThreshold = 100

// Sentinel values for domain processing state
var (
	DomainStateUnchecked = time.Time{}                  // 0001-01-01 00:00:00 - needs domain_check
	DomainStateChecked   = time.Time{}.Add(time.Second) // 0001-01-01 00:00:01 - needs feed_crawl
)

// FullHost returns the complete hostname (host + tld)
func (d *Domain) FullHost() string {
	return FullHost(d.Host, d.TLD)
}

// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
	URL         string `json:"url"`
	Type        string `json:"type"`     // "rss", "atom", "json", or "unknown"
	Category    string `json:"category"` // "main", "comments", "category", "author", "article", "podcast"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

	// Timing
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCheckedAt time.Time `json:"last_checked_at,omitempty"` // feed_check: when last checked
	NextCheckAt   time.Time `json:"next_check_at,omitempty"`   // feed_check: when to next check
	LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated

	// Cache headers for conditional requests
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Health tracking
	Status      string    `json:"status"` // "pass", "hold", "skip"
	LastError   string    `json:"last_error,omitempty"`
	LastErrorAt time.Time `json:"last_error_at,omitempty"`

	// Discovery source
	SourceURL  string `json:"source_url,omitempty"`
	DomainHost string `json:"domain_host,omitempty"`
	DomainTLD  string `json:"domain_tld,omitempty"`

	// Content stats
	ItemCount      int       `json:"item_count,omitempty"` // Number of items in last feed_check
	OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
	NewestItemDate time.Time `json:"newest_item_date,omitempty"`

	// Adaptive check interval
	NoUpdate int `json:"no_update"` // Consecutive checks with no change

	// Publishing to PDS
	PublishStatus  string `json:"publish_status"`            // "hold", "pass", "skip"
	PublishAccount string `json:"publish_account,omitempty"` // e.g., "news.ycombinator.com.1440.news"
}

// Enclosure represents a media attachment (audio, video, image)
type Enclosure struct {
	URL    string `json:"url"`
	Type   string `json:"type"`   // MIME type (audio/mpeg, image/jpeg, etc.)
	Length int64  `json:"length"` // Size in bytes
}

// Item represents an individual entry/article from a feed
type Item struct {
	FeedURL      string    `json:"feed_url"`
	GUID         string    `json:"guid,omitempty"`
	Title        string    `json:"title,omitempty"`
	Link         string    `json:"link,omitempty"`
	Description  string    `json:"description,omitempty"`
	Content      string    `json:"content,omitempty"`
	Author       string    `json:"author,omitempty"`
	PubDate      time.Time `json:"pub_date,omitempty"`
	DiscoveredAt time.Time `json:"discovered_at"`
	UpdatedAt    time.Time `json:"updated_at,omitempty"`

	// Media attachments
	Enclosure *Enclosure `json:"enclosure,omitempty"`  // Primary enclosure (podcast audio, etc.)
	ImageURLs []string   `json:"image_urls,omitempty"` // Image URLs extracted from content
	Tags      []string   `json:"tags,omitempty"`       // Category/tag strings from feed

	// Publishing to PDS
	PublishedAt  time.Time `json:"published_at,omitempty"`
	PublishedUri string    `json:"published_uri,omitempty"`
}

// ShortURL represents a shortened URL mapping
type ShortURL struct {
	Code        string    `json:"code"`
	OriginalURL string    `json:"original_url"`
	ItemGUID    string    `json:"item_guid,omitempty"`
	FeedURL     string    `json:"feed_url,omitempty"`
	CreatedAt   time.Time `json:"created_at"`
	ClickCount  int       `json:"click_count"`
}

// Click represents a click event on a short URL
type Click struct {
	ID        int64     `json:"id"`
	ShortCode string    `json:"short_code"`
	ClickedAt time.Time `json:"clicked_at"`
	Referrer  string    `json:"referrer,omitempty"`
	UserAgent string    `json:"user_agent,omitempty"`
	IPHash    string    `json:"ip_hash,omitempty"`
	Country   string    `json:"country,omitempty"`
}

// DashboardStats holds all statistics for the dashboard
type DashboardStats struct {
	// Domain stats
	TotalDomains int `json:"total_domains"`
	HoldDomains  int `json:"hold_domains"`
	PassDomains  int `json:"pass_domains"`
	SkipDomains  int `json:"skip_domains"`
	DeadDomains  int `json:"dead_domains"`

	// Feed stats
	TotalFeeds   int `json:"total_feeds"`
	AliveFeeds   int `json:"alive_feeds"`   // status='pass' (healthy feeds)
	PublishFeeds int `json:"publish_feeds"` // publish_status='pass' (approved for publishing)
	SkipFeeds    int `json:"skip_feeds"`
	HoldFeeds    int `json:"hold_feeds"`
	DeadFeeds    int `json:"dead_feeds"`
	EmptyFeeds   int `json:"empty_feeds"`
	RSSFeeds     int `json:"rss_feeds"`
	AtomFeeds    int `json:"atom_feeds"`
	JSONFeeds    int `json:"json_feeds"`
	UnknownFeeds int `json:"unknown_feeds"`

	// Processing rates (per minute)
	DomainsCrawled  int32 `json:"domains_crawled"`   // feed_crawl count
	DomainCheckRate int   `json:"domain_check_rate"` // domain_check per minute
	FeedCrawlRate   int   `json:"feed_crawl_rate"`   // feed_crawl per minute
	FeedCheckRate   int   `json:"feed_check_rate"`   // feed_check per minute

	// Timing
	UpdatedAt time.Time `json:"updated_at"`
}

// TLDStat holds TLD statistics
type TLDStat struct {
	TLD   string `json:"tld"`
	Count int    `json:"count"`
}

// DomainStat holds domain statistics
type DomainStat struct {
	Host       string `json:"host"`
	FeedsFound int    `json:"feeds_found"`
}

// FeedInfo holds basic feed metadata for profile setup
type FeedInfo struct {
	Title       string
	Description string
	SiteURL     string
	SourceHost  string
}
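The CrawledAt sentinels encode a small state machine; a sketch of how a worker might dispatch on them (the nextStage helper is hypothetical, the stage names come from the comments above):

package main

import (
	"fmt"

	"github.com/1440news/shared"
)

// nextStage maps a Domain's CrawledAt sentinel to its pipeline stage.
func nextStage(d shared.Domain) string {
	switch {
	case d.CrawledAt.Equal(shared.DomainStateUnchecked):
		return "domain_check" // zero time: never checked
	case d.CrawledAt.Equal(shared.DomainStateChecked):
		return "feed_crawl" // zero time + 1s: checked, awaiting crawl
	default:
		return "done" // a real timestamp: fully processed
	}
}

func main() {
	d := shared.Domain{Host: "example", TLD: "com"}
	fmt.Println(d.FullHost(), nextStage(d)) // example.com domain_check
}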
util.go
@@ -0,0 +1,242 @@
package shared

import (
	"net/url"
	"strings"
)

// NormalizeURL strips scheme (http/https) and www. prefix to save storage space.
// The normalized URL can be reconstructed with https:// for fetching.
func NormalizeURL(rawURL string) string {
	// Remove scheme
	u := rawURL
	if strings.HasPrefix(u, "https://") {
		u = u[8:]
	} else if strings.HasPrefix(u, "http://") {
		u = u[7:]
	}

	// Remove www. prefix
	if strings.HasPrefix(u, "www.") {
		u = u[4:]
	}

	return u
}

// NormalizeHost strips www. prefix from a hostname for canonical storage
func NormalizeHost(host string) string {
	if strings.HasPrefix(host, "www.") {
		return host[4:]
	}
	return host
}

// ReverseHost converts a reverse domain notation back to normal
// e.g., "com.example.www" -> "www.example.com"
func ReverseHost(reverseHost string) string {
	parts := strings.Split(reverseHost, ".")
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return strings.Join(parts, ".")
}

// GetTLD extracts the TLD from a hostname
func GetTLD(host string) string {
	parts := strings.Split(host, ".")
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

// StripTLD removes the TLD suffix from a hostname
// e.g., "example.com" -> "example", "sub.example.com" -> "sub.example"
func StripTLD(host string) string {
	idx := strings.LastIndex(host, ".")
	if idx > 0 {
		return host[:idx]
	}
	return host
}

// GetDomainHost extracts the host part from a full domain (without TLD)
// e.g., "npr.org" -> "npr", "bbc.co.uk" -> "bbc.co"
func GetDomainHost(domain string) string {
	return StripTLD(domain)
}

// FullHost reconstructs the full hostname from host and tld
// e.g., ("example", "com") -> "example.com"
func FullHost(host, tld string) string {
	if tld == "" {
		return host
	}
	return host + "." + tld
}

// MakeAbsoluteURL resolves a relative URL against a base URL
func MakeAbsoluteURL(href, baseURL string) string {
	base, err := url.Parse(baseURL)
	if err != nil {
		return href
	}

	link, err := url.Parse(href)
	if err != nil {
		return href
	}

	return base.ResolveReference(link).String()
}

// SearchQuery represents a parsed search with optional type prefix
type SearchQuery struct {
	Type       string // "all", "domain", "url", "title", "description", "item"
	Pattern    string // the search pattern (without prefix)
	ExactMatch bool   // for domain searches: true if TLD was specified (d:npr.org matches exactly)
	// For "all" type searches that look like domains, these are populated for additional exact matching
	DomainHost string // e.g., "npr" from "npr.org"
	DomainTLD  string // e.g., "org" from "npr.org"
}

// ParseSearchPrefix parses search prefixes like "a:", "d:", "f:", "t:", "s:", "i:"
// Returns SearchQuery with Type and Pattern
// Types: "all" (default or a: prefix), "domain" (d:, extracts TLD from pattern),
//
//	"url" (f:), "title" (t:), "description" (s:), "item" (i:)
func ParseSearchPrefix(query string) SearchQuery {
	query = strings.TrimSpace(query)
	if query == "" {
		return SearchQuery{Type: "all", Pattern: ""}
	}

	// Check for prefixes (case-insensitive)
	lower := strings.ToLower(query)
	if strings.HasPrefix(lower, "a:") {
		return SearchQuery{Type: "all", Pattern: strings.TrimSpace(query[2:])}
	}
	if strings.HasPrefix(lower, "d:") {
		return SearchQuery{Type: "domain", Pattern: strings.TrimSpace(query[2:])}
	}
	if strings.HasPrefix(lower, "f:") {
		return SearchQuery{Type: "url", Pattern: strings.TrimSpace(query[2:])}
	}
	if strings.HasPrefix(lower, "t:") {
		return SearchQuery{Type: "title", Pattern: strings.TrimSpace(query[2:])}
	}
	if strings.HasPrefix(lower, "s:") {
		return SearchQuery{Type: "description", Pattern: strings.TrimSpace(query[2:])}
	}
	if strings.HasPrefix(lower, "i:") {
		return SearchQuery{Type: "item", Pattern: strings.TrimSpace(query[2:])}
	}

	// For "all" type, check if pattern looks like a domain and extract host/tld
	result := SearchQuery{Type: "all", Pattern: query}
	if LooksLikeDomain(query) {
		host, tld := ParseSearchTerm(query)
		if tld != "" {
			result.DomainHost = host
			result.DomainTLD = tld
		}
	}
	return result
}

// LooksLikeDomain checks if a query looks like a domain name
func LooksLikeDomain(query string) bool {
	if query == "" || strings.Contains(query, " ") {
		return false
	}
	// Must have at least one dot
	lastDot := strings.LastIndex(query, ".")
	if lastDot == -1 || lastDot == 0 || lastDot == len(query)-1 {
		return false
	}
	// TLD must be 2-6 letters (either case)
	tld := query[lastDot+1:]
	if len(tld) < 2 || len(tld) > 6 {
		return false
	}
	for _, c := range tld {
		if c < 'a' || c > 'z' {
			if c < 'A' || c > 'Z' {
				return false
			}
		}
	}
	return true
}

// ParseSearchTerm analyzes a search query and extracts host pattern and optional TLD filter.
// If the search ends with what looks like a TLD (e.g., "example.com"), it splits them.
// Returns (hostPattern, tldFilter) where tldFilter may be empty.
func ParseSearchTerm(search string) (hostPattern, tldFilter string) {
	search = strings.TrimSpace(search)
	if search == "" {
		return "", ""
	}

	// Check if search contains a dot
	lastDot := strings.LastIndex(search, ".")
	if lastDot == -1 || lastDot == len(search)-1 {
		// No dot or ends with dot - treat as host-only search
		return search, ""
	}

	// Extract potential TLD (part after last dot)
	potentialTLD := strings.ToLower(search[lastDot+1:])
	hostPart := search[:lastDot]

	// Validate TLD: must be 2-24 lowercase letters (covers all IANA TLDs)
	if len(potentialTLD) < 2 || len(potentialTLD) > 24 {
		return search, ""
	}
	for _, c := range potentialTLD {
		if c < 'a' || c > 'z' {
			// Contains non-letter, not a TLD
			return search, ""
		}
	}

	// Looks like a valid TLD pattern
	return hostPart, potentialTLD
}

// ShouldCrawl checks if a link should be crawled (same host as base)
func ShouldCrawl(link, baseURL string) bool {
	linkURL, err := url.Parse(link)
	if err != nil {
		return false
	}

	baseURLParsed, err := url.Parse(baseURL)
	if err != nil {
		return false
	}

	return linkURL.Host == baseURLParsed.Host
}

// ShouldAutoSkipDomain checks if a domain should be auto-skipped based on patterns
func ShouldAutoSkipDomain(host string) bool {
	// Never skip our own domain
	if strings.HasSuffix(host, "1440.news") || host == "1440.news" {
		return false
	}
	// Skip bare TLDs (no dot means it's just "com", "net", etc.)
	if !strings.Contains(host, ".") {
		return true
	}
	// Skip domains starting with a digit (spam pattern)
	if len(host) > 0 && host[0] >= '0' && host[0] <= '9' {
		return true
	}
	// Skip domains starting with letter-dash (spam pattern, e.g., "a-example.com")
	if len(host) > 1 && ((host[0] >= 'a' && host[0] <= 'z') || (host[0] >= 'A' && host[0] <= 'Z')) && host[1] == '-' {
		return true
	}
	return false
}
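A sketch exercising the URL and search helpers (not part of this commit):

package main

import (
	"fmt"

	"github.com/1440news/shared"
)

func main() {
	fmt.Println(shared.NormalizeURL("https://www.example.com/feed")) // example.com/feed

	q := shared.ParseSearchPrefix("t:climate")
	fmt.Println(q.Type, q.Pattern) // title climate

	// An unprefixed query that looks like a domain also gets host/tld extracted.
	q = shared.ParseSearchPrefix("npr.org")
	fmt.Println(q.Type, q.DomainHost, q.DomainTLD) // all npr org
}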