package shared import ( "net/url" "regexp" "strings" ) // DeriveHandleFromFeed generates an AT Protocol handle from a feed URL // Format: {domain}-{category}.1440.news // AT Protocol allows up to 63 characters per label, but the PDS // restricts the first segment to 18 characters for local handles. // Examples: // // feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news // news.ycombinator.com/rss → ycombinator.1440.news func DeriveHandleFromFeed(feedURL string) string { const maxSubdomainLen = 18 // PDS limit for first segment // Ensure we have a scheme for parsing if !strings.Contains(feedURL, "://") { feedURL = "https://" + feedURL } u, err := url.Parse(feedURL) if err != nil { return "" } hostname := strings.ToLower(u.Hostname()) path := strings.ToLower(u.Path) // Remove common feed suffixes/extensions suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"} for _, suffix := range suffixesToRemove { path = strings.TrimSuffix(path, suffix) } // Split path into segments and filter noise segments := strings.Split(strings.Trim(path, "/"), "/") skipPathWords := map[string]bool{ "rss": true, "feed": true, "feeds": true, "atom": true, "xml": true, "default": true, "index": true, "services": true, "nyt": true, } var pathParts []string for _, seg := range segments { seg = cleanHandleSegment(seg) if seg != "" && !skipPathWords[seg] { pathParts = append(pathParts, seg) } } // Split hostname and extract the meaningful domain hostParts := strings.Split(hostname, ".") // Two-part TLDs to handle specially twoPartTLDs := map[string]bool{ "co.uk": true, "com.au": true, "co.nz": true, "co.jp": true, "com.br": true, "co.in": true, "org.uk": true, "ac.uk": true, } // Check for two-part TLD if len(hostParts) >= 2 { possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1] if twoPartTLDs[possibleTwoPartTLD] { hostParts = hostParts[:len(hostParts)-2] } else { // Single TLD - remove it singleTLDs := map[string]bool{ "com": true, "org": true, "net": true, "io": true, "edu": true, "gov": true, "uk": true, "de": true, "fr": true, } if singleTLDs[hostParts[len(hostParts)-1]] { hostParts = hostParts[:len(hostParts)-1] } } } // Skip noise subdomains skipHostWords := map[string]bool{ "www": true, "feeds": true, "rss": true, "feed": true, "api": true, "cdn": true, "static": true, "news": true, } var meaningfulHostParts []string for _, part := range hostParts { if !skipHostWords[part] && part != "" { meaningfulHostParts = append(meaningfulHostParts, part) } } // Get the main domain (e.g., "bbci", "ycombinator", "nytimes") var mainDomain string if len(meaningfulHostParts) > 0 { mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1] } else if len(hostParts) > 0 { mainDomain = hostParts[len(hostParts)-1] } // Special case: "bbci" should become "bbc" if mainDomain == "bbci" { mainDomain = "bbc" } // Abbreviations for long category names to fit 18-char limit categoryAbbrevs := map[string]string{ "science-and-environment": "sci-env", "entertainment-and-arts": "ent-arts", "science-environment": "sci-env", "entertainment-arts": "ent-arts", "technology": "tech", "business": "biz", "international": "intl", "environment": "env", "entertainment": "ent", "politics": "pol", } // Build subdomain: domain + category (from path) var subdomain string if len(pathParts) > 0 { // Use last meaningful path part as category (e.g., "technology" from /news/technology/) category := pathParts[len(pathParts)-1] // Skip generic categories if category == "news" && len(pathParts) == 1 { subdomain = mainDomain } else { // Try to abbreviate if the full subdomain would be too long fullSubdomain := mainDomain + "-" + category if len(fullSubdomain) > maxSubdomainLen { if abbrev, ok := categoryAbbrevs[category]; ok { category = abbrev } } subdomain = mainDomain + "-" + category } } else { subdomain = mainDomain } // If still too long, just use main hostname if len(subdomain) > maxSubdomainLen { subdomain = mainDomain } // Final safety: truncate if still too long if len(subdomain) > maxSubdomainLen { subdomain = subdomain[:maxSubdomainLen] } subdomain = strings.Trim(subdomain, "-") // Collapse multiple hyphens for strings.Contains(subdomain, "--") { subdomain = strings.ReplaceAll(subdomain, "--", "-") } return subdomain + ".1440.news" } // cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment // Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens func cleanHandleSegment(s string) string { // Remove file extensions if idx := strings.LastIndex(s, "."); idx > 0 { s = s[:idx] } // Convert to lowercase s = strings.ToLower(s) // Strip common feed prefixes/suffixes from the segment itself // e.g., "showrss" → "show", "rssworld" → "world" feedAffixes := []string{"rss", "feed", "atom", "xml"} for _, affix := range feedAffixes { // Strip suffix (e.g., "showrss" → "show") if strings.HasSuffix(s, affix) && len(s) > len(affix) { s = strings.TrimSuffix(s, affix) break } // Strip prefix (e.g., "rssworld" → "world") if strings.HasPrefix(s, affix) && len(s) > len(affix) { s = strings.TrimPrefix(s, affix) break } } // Replace underscores and other separators with hyphens s = strings.ReplaceAll(s, "_", "-") s = strings.ReplaceAll(s, " ", "-") // Remove any characters that aren't alphanumeric or hyphens reg := regexp.MustCompile(`[^a-z0-9-]`) s = reg.ReplaceAllString(s, "") // Collapse multiple hyphens for strings.Contains(s, "--") { s = strings.ReplaceAll(s, "--", "-") } // Trim leading/trailing hyphens s = strings.Trim(s, "-") return s } // SplitHandle extracts the path prefix and hostname from a derived handle // Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com") func SplitHandle(handle string) (prefix string, hostname string) { // Remove .1440.news suffix handle = strings.TrimSuffix(handle, ".1440.news") parts := strings.Split(handle, ".") // Try to find where hostname starts by looking for valid hostname patterns if len(parts) >= 2 { for i := 0; i < len(parts)-1; i++ { remaining := strings.Join(parts[i:], ".") if looksLikeHostname(remaining) { if i > 0 { prefix = strings.Join(parts[:i], ".") } hostname = remaining return } } } // Fallback: no prefix, entire thing is hostname hostname = handle return "", hostname } func isLikelyTLDPart(s string) bool { tlds := map[string]bool{ "com": true, "org": true, "net": true, "edu": true, "gov": true, "io": true, "co": true, "uk": true, "de": true, "fr": true, "jp": true, "au": true, "ca": true, "nl": true, "se": true, "news": true, "blog": true, "tech": true, "dev": true, } return tlds[s] } func isTwoPartTLD(first, second string) bool { twoPartTLDs := map[string]bool{ "co.uk": true, "com.au": true, "co.jp": true, "co.nz": true, "org.uk": true, "net.au": true, "com.br": true, } return twoPartTLDs[first+"."+second] } func looksLikeHostname(s string) bool { // A hostname typically has at least one dot and ends with a TLD-like part parts := strings.Split(s, ".") if len(parts) < 2 { return false } lastPart := parts[len(parts)-1] return isLikelyTLDPart(lastPart) }