Files
commons/handle.go
primal 705dc7a40f Rename package from shared to commons
All files now use `package commons` instead of `package shared` to match
the module name and directory.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 15:40:27 -05:00

263 lines
7.4 KiB
Go

package commons
import (
"net/url"
"regexp"
"strings"
)
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {domain}-{category}.1440.news
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
//
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
// news.ycombinator.com/rss → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
const maxSubdomainLen = 18 // PDS limit for first segment
// Ensure we have a scheme for parsing
if !strings.Contains(feedURL, "://") {
feedURL = "https://" + feedURL
}
u, err := url.Parse(feedURL)
if err != nil {
return ""
}
hostname := strings.ToLower(u.Hostname())
path := strings.ToLower(u.Path)
// Remove common feed suffixes/extensions
suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
for _, suffix := range suffixesToRemove {
path = strings.TrimSuffix(path, suffix)
}
// Split path into segments and filter noise
segments := strings.Split(strings.Trim(path, "/"), "/")
skipPathWords := map[string]bool{
"rss": true, "feed": true, "feeds": true, "atom": true,
"xml": true, "default": true, "index": true, "services": true,
"nyt": true,
}
var pathParts []string
for _, seg := range segments {
seg = cleanHandleSegment(seg)
if seg != "" && !skipPathWords[seg] {
pathParts = append(pathParts, seg)
}
}
// Split hostname and extract the meaningful domain
hostParts := strings.Split(hostname, ".")
// Two-part TLDs to handle specially
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
}
// Check for two-part TLD
if len(hostParts) >= 2 {
possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
if twoPartTLDs[possibleTwoPartTLD] {
hostParts = hostParts[:len(hostParts)-2]
} else {
// Single TLD - remove it
singleTLDs := map[string]bool{
"com": true, "org": true, "net": true, "io": true,
"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
}
if singleTLDs[hostParts[len(hostParts)-1]] {
hostParts = hostParts[:len(hostParts)-1]
}
}
}
// Skip noise subdomains
skipHostWords := map[string]bool{
"www": true, "feeds": true, "rss": true, "feed": true,
"api": true, "cdn": true, "static": true, "news": true,
}
var meaningfulHostParts []string
for _, part := range hostParts {
if !skipHostWords[part] && part != "" {
meaningfulHostParts = append(meaningfulHostParts, part)
}
}
// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
var mainDomain string
if len(meaningfulHostParts) > 0 {
mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
} else if len(hostParts) > 0 {
mainDomain = hostParts[len(hostParts)-1]
}
// Special case: "bbci" should become "bbc"
if mainDomain == "bbci" {
mainDomain = "bbc"
}
// Abbreviations for long category names to fit 18-char limit
categoryAbbrevs := map[string]string{
"science-and-environment": "sci-env",
"entertainment-and-arts": "ent-arts",
"science-environment": "sci-env",
"entertainment-arts": "ent-arts",
"technology": "tech",
"business": "biz",
"international": "intl",
"environment": "env",
"entertainment": "ent",
"politics": "pol",
}
// Build subdomain: domain + category (from path)
var subdomain string
if len(pathParts) > 0 {
// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
category := pathParts[len(pathParts)-1]
// Skip generic categories
if category == "news" && len(pathParts) == 1 {
subdomain = mainDomain
} else {
// Try to abbreviate if the full subdomain would be too long
fullSubdomain := mainDomain + "-" + category
if len(fullSubdomain) > maxSubdomainLen {
if abbrev, ok := categoryAbbrevs[category]; ok {
category = abbrev
}
}
subdomain = mainDomain + "-" + category
}
} else {
subdomain = mainDomain
}
// If still too long, just use main hostname
if len(subdomain) > maxSubdomainLen {
subdomain = mainDomain
}
// Final safety: truncate if still too long
if len(subdomain) > maxSubdomainLen {
subdomain = subdomain[:maxSubdomainLen]
}
subdomain = strings.Trim(subdomain, "-")
// Collapse multiple hyphens
for strings.Contains(subdomain, "--") {
subdomain = strings.ReplaceAll(subdomain, "--", "-")
}
return subdomain + ".1440.news"
}
// cleanHandleSegment sanitizes a string for use in an AT Protocol handle segment
// Handle segments must be alphanumeric with hyphens, no leading/trailing hyphens
func cleanHandleSegment(s string) string {
// Remove file extensions
if idx := strings.LastIndex(s, "."); idx > 0 {
s = s[:idx]
}
// Convert to lowercase
s = strings.ToLower(s)
// Strip common feed prefixes/suffixes from the segment itself
// e.g., "showrss" → "show", "rssworld" → "world"
feedAffixes := []string{"rss", "feed", "atom", "xml"}
for _, affix := range feedAffixes {
// Strip suffix (e.g., "showrss" → "show")
if strings.HasSuffix(s, affix) && len(s) > len(affix) {
s = strings.TrimSuffix(s, affix)
break
}
// Strip prefix (e.g., "rssworld" → "world")
if strings.HasPrefix(s, affix) && len(s) > len(affix) {
s = strings.TrimPrefix(s, affix)
break
}
}
// Replace underscores and other separators with hyphens
s = strings.ReplaceAll(s, "_", "-")
s = strings.ReplaceAll(s, " ", "-")
// Remove any characters that aren't alphanumeric or hyphens
reg := regexp.MustCompile(`[^a-z0-9-]`)
s = reg.ReplaceAllString(s, "")
// Collapse multiple hyphens
for strings.Contains(s, "--") {
s = strings.ReplaceAll(s, "--", "-")
}
// Trim leading/trailing hyphens
s = strings.Trim(s, "-")
return s
}
// SplitHandle extracts the path prefix and hostname from a derived handle
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
func SplitHandle(handle string) (prefix string, hostname string) {
// Remove .1440.news suffix
handle = strings.TrimSuffix(handle, ".1440.news")
parts := strings.Split(handle, ".")
// Try to find where hostname starts by looking for valid hostname patterns
if len(parts) >= 2 {
for i := 0; i < len(parts)-1; i++ {
remaining := strings.Join(parts[i:], ".")
if looksLikeHostname(remaining) {
if i > 0 {
prefix = strings.Join(parts[:i], ".")
}
hostname = remaining
return
}
}
}
// Fallback: no prefix, entire thing is hostname
hostname = handle
return "", hostname
}
func isLikelyTLDPart(s string) bool {
tlds := map[string]bool{
"com": true, "org": true, "net": true, "edu": true, "gov": true,
"io": true, "co": true, "uk": true, "de": true, "fr": true,
"jp": true, "au": true, "ca": true, "nl": true, "se": true,
"news": true, "blog": true, "tech": true, "dev": true,
}
return tlds[s]
}
func isTwoPartTLD(first, second string) bool {
twoPartTLDs := map[string]bool{
"co.uk": true, "com.au": true, "co.jp": true, "co.nz": true,
"org.uk": true, "net.au": true, "com.br": true,
}
return twoPartTLDs[first+"."+second]
}
func looksLikeHostname(s string) bool {
// A hostname typically has at least one dot and ends with a TLD-like part
parts := strings.Split(s, ".")
if len(parts) < 2 {
return false
}
lastPart := parts[len(parts)-1]
return isLikelyTLDPart(lastPart)
}