Contains: - db.go: Database connection wrapper with helper methods - models.go: Domain, Feed, Item, ShortURL, Click structs - util.go: URL normalization, TLD functions, search helpers - handle.go: AT Protocol handle derivation from feed URLs Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
263 lines
7.4 KiB
Go
263 lines
7.4 KiB
Go
package shared
|
|
|
|
import (
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
|
|
// Format: {domain}-{category}.1440.news
|
|
// AT Protocol allows up to 63 characters per label, but the PDS
|
|
// restricts the first segment to 18 characters for local handles.
|
|
// Examples:
|
|
//
|
|
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
|
|
// news.ycombinator.com/rss → ycombinator.1440.news
|
|
func DeriveHandleFromFeed(feedURL string) string {
|
|
const maxSubdomainLen = 18 // PDS limit for first segment
|
|
|
|
// Ensure we have a scheme for parsing
|
|
if !strings.Contains(feedURL, "://") {
|
|
feedURL = "https://" + feedURL
|
|
}
|
|
|
|
u, err := url.Parse(feedURL)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
hostname := strings.ToLower(u.Hostname())
|
|
path := strings.ToLower(u.Path)
|
|
|
|
// Remove common feed suffixes/extensions
|
|
suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
|
|
for _, suffix := range suffixesToRemove {
|
|
path = strings.TrimSuffix(path, suffix)
|
|
}
|
|
|
|
// Split path into segments and filter noise
|
|
segments := strings.Split(strings.Trim(path, "/"), "/")
|
|
skipPathWords := map[string]bool{
|
|
"rss": true, "feed": true, "feeds": true, "atom": true,
|
|
"xml": true, "default": true, "index": true, "services": true,
|
|
"nyt": true,
|
|
}
|
|
|
|
var pathParts []string
|
|
for _, seg := range segments {
|
|
seg = cleanHandleSegment(seg)
|
|
if seg != "" && !skipPathWords[seg] {
|
|
pathParts = append(pathParts, seg)
|
|
}
|
|
}
|
|
|
|
// Split hostname and extract the meaningful domain
|
|
hostParts := strings.Split(hostname, ".")
|
|
|
|
// Two-part TLDs to handle specially
|
|
twoPartTLDs := map[string]bool{
|
|
"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
|
|
"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
|
|
}
|
|
|
|
// Check for two-part TLD
|
|
if len(hostParts) >= 2 {
|
|
possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
|
|
if twoPartTLDs[possibleTwoPartTLD] {
|
|
hostParts = hostParts[:len(hostParts)-2]
|
|
} else {
|
|
// Single TLD - remove it
|
|
singleTLDs := map[string]bool{
|
|
"com": true, "org": true, "net": true, "io": true,
|
|
"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
|
|
}
|
|
if singleTLDs[hostParts[len(hostParts)-1]] {
|
|
hostParts = hostParts[:len(hostParts)-1]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Skip noise subdomains
|
|
skipHostWords := map[string]bool{
|
|
"www": true, "feeds": true, "rss": true, "feed": true,
|
|
"api": true, "cdn": true, "static": true, "news": true,
|
|
}
|
|
|
|
var meaningfulHostParts []string
|
|
for _, part := range hostParts {
|
|
if !skipHostWords[part] && part != "" {
|
|
meaningfulHostParts = append(meaningfulHostParts, part)
|
|
}
|
|
}
|
|
|
|
// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
|
|
var mainDomain string
|
|
if len(meaningfulHostParts) > 0 {
|
|
mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
|
|
} else if len(hostParts) > 0 {
|
|
mainDomain = hostParts[len(hostParts)-1]
|
|
}
|
|
|
|
// Special case: "bbci" should become "bbc"
|
|
if mainDomain == "bbci" {
|
|
mainDomain = "bbc"
|
|
}
|
|
|
|
// Abbreviations for long category names to fit 18-char limit
|
|
categoryAbbrevs := map[string]string{
|
|
"science-and-environment": "sci-env",
|
|
"entertainment-and-arts": "ent-arts",
|
|
"science-environment": "sci-env",
|
|
"entertainment-arts": "ent-arts",
|
|
"technology": "tech",
|
|
"business": "biz",
|
|
"international": "intl",
|
|
"environment": "env",
|
|
"entertainment": "ent",
|
|
"politics": "pol",
|
|
}
|
|
|
|
// Build subdomain: domain + category (from path)
|
|
var subdomain string
|
|
if len(pathParts) > 0 {
|
|
// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
|
|
category := pathParts[len(pathParts)-1]
|
|
// Skip generic categories
|
|
if category == "news" && len(pathParts) == 1 {
|
|
subdomain = mainDomain
|
|
} else {
|
|
// Try to abbreviate if the full subdomain would be too long
|
|
fullSubdomain := mainDomain + "-" + category
|
|
if len(fullSubdomain) > maxSubdomainLen {
|
|
if abbrev, ok := categoryAbbrevs[category]; ok {
|
|
category = abbrev
|
|
}
|
|
}
|
|
subdomain = mainDomain + "-" + category
|
|
}
|
|
} else {
|
|
subdomain = mainDomain
|
|
}
|
|
|
|
// If still too long, just use main hostname
|
|
if len(subdomain) > maxSubdomainLen {
|
|
subdomain = mainDomain
|
|
}
|
|
|
|
// Final safety: truncate if still too long
|
|
if len(subdomain) > maxSubdomainLen {
|
|
subdomain = subdomain[:maxSubdomainLen]
|
|
}
|
|
|
|
subdomain = strings.Trim(subdomain, "-")
|
|
|
|
// Collapse multiple hyphens
|
|
for strings.Contains(subdomain, "--") {
|
|
subdomain = strings.ReplaceAll(subdomain, "--", "-")
|
|
}
|
|
|
|
return subdomain + ".1440.news"
|
|
}
|
|
|
|
// handleSegmentInvalidChars matches every character that is not allowed in a
// handle segment. It is compiled once at package scope instead of on every
// cleanHandleSegment call.
var handleSegmentInvalidChars = regexp.MustCompile(`[^a-z0-9-]`)

// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment.
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens.
//
// The steps, in order: drop everything from the last "." onward (unless the
// dot is the first character), lowercase, strip at most one feed-related
// affix, turn underscores and spaces into hyphens, delete every remaining
// character outside [a-z0-9-], collapse runs of hyphens, and trim hyphens
// from both ends. The result may be empty.
func cleanHandleSegment(s string) string {
	// Remove file extensions. idx > 0 keeps dot-leading names such as
	// ".hidden" intact here; the dot itself is removed further down.
	if idx := strings.LastIndex(s, "."); idx > 0 {
		s = s[:idx]
	}

	// Convert to lowercase.
	s = strings.ToLower(s)

	// Strip common feed prefixes/suffixes from the segment itself,
	// e.g., "showrss" → "show", "rssworld" → "world". Only the first
	// matching affix is removed, and a segment that consists solely of the
	// affix is left alone.
	for _, affix := range []string{"rss", "feed", "atom", "xml"} {
		if len(s) <= len(affix) {
			continue
		}
		if strings.HasSuffix(s, affix) {
			s = strings.TrimSuffix(s, affix)
			break
		}
		if strings.HasPrefix(s, affix) {
			s = strings.TrimPrefix(s, affix)
			break
		}
	}

	// Replace underscores and spaces with hyphens.
	s = strings.ReplaceAll(s, "_", "-")
	s = strings.ReplaceAll(s, " ", "-")

	// Remove any characters that aren't alphanumeric or hyphens.
	s = handleSegmentInvalidChars.ReplaceAllString(s, "")

	// Collapse multiple hyphens.
	for strings.Contains(s, "--") {
		s = strings.ReplaceAll(s, "--", "-")
	}

	// Trim leading/trailing hyphens.
	return strings.Trim(s, "-")
}
|
|
|
|
// SplitHandle extracts the path prefix and hostname from a derived handle
|
|
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
|
|
func SplitHandle(handle string) (prefix string, hostname string) {
|
|
// Remove .1440.news suffix
|
|
handle = strings.TrimSuffix(handle, ".1440.news")
|
|
|
|
parts := strings.Split(handle, ".")
|
|
|
|
// Try to find where hostname starts by looking for valid hostname patterns
|
|
if len(parts) >= 2 {
|
|
for i := 0; i < len(parts)-1; i++ {
|
|
remaining := strings.Join(parts[i:], ".")
|
|
if looksLikeHostname(remaining) {
|
|
if i > 0 {
|
|
prefix = strings.Join(parts[:i], ".")
|
|
}
|
|
hostname = remaining
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback: no prefix, entire thing is hostname
|
|
hostname = handle
|
|
return "", hostname
|
|
}
|
|
|
|
// isLikelyTLDPart reports whether s is one of a fixed set of labels that are
// treated as TLD-like. The comparison is an exact, case-sensitive match.
func isLikelyTLDPart(s string) bool {
	switch s {
	case "com", "org", "net", "edu", "gov",
		"io", "co", "uk", "de", "fr",
		"jp", "au", "ca", "nl", "se",
		"news", "blog", "tech", "dev":
		return true
	default:
		return false
	}
}
|
|
|
|
// isTwoPartTLD reports whether first and second, joined with a dot, form one
// of a fixed set of two-part TLDs (for example "co" and "uk" → "co.uk").
// The comparison is an exact, case-sensitive match.
func isTwoPartTLD(first, second string) bool {
	switch first + "." + second {
	case "co.uk", "com.au", "co.jp", "co.nz",
		"org.uk", "net.au", "com.br":
		return true
	default:
		return false
	}
}
|
|
|
|
func looksLikeHostname(s string) bool {
|
|
// A hostname typically has at least one dot and ends with a TLD-like part
|
|
parts := strings.Split(s, ".")
|
|
if len(parts) < 2 {
|
|
return false
|
|
}
|
|
lastPart := parts[len(parts)-1]
|
|
return isLikelyTLDPart(lastPart)
|
|
}
|