// commons/handle.go

package commons

import (
	"net/url"
	"regexp"
	"strings"
)

// handleInvalidChars matches every character that is not allowed in a handle
// label, i.e. anything other than a-z, 0-9 and '-'. It is compiled once at
// package initialization instead of on every sanitization call.
var handleInvalidChars = regexp.MustCompile(`[^a-z0-9-]`)

// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL.
// Format: {domain}-{category}.1440.news
//
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles. To fit,
// a long category is first abbreviated, then dropped, and as a last resort
// the domain label itself is truncated.
//
// feedURL may omit the scheme ("https://" is assumed). The empty string is
// returned when the URL cannot be parsed or when no usable domain label can
// be derived from its host.
//
// Examples:
//
//	feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
//	news.ycombinator.com/rss → ycombinator.1440.news
func DeriveHandleFromFeed(feedURL string) string {
	const (
		maxSubdomainLen = 18 // PDS limit for first segment
		handleSuffix    = ".1440.news"
	)

	// Ensure we have a scheme for parsing.
	feedURL = strings.TrimSpace(feedURL)
	if !strings.Contains(feedURL, "://") {
		feedURL = "https://" + feedURL
	}

	u, err := url.Parse(feedURL)
	if err != nil {
		return ""
	}

	mainDomain := handleMainDomain(u.Hostname())
	if mainDomain == "" {
		// Without a domain label the result would be the invalid handle
		// ".1440.news"; report failure the same way as a parse error.
		return ""
	}

	subdomain := mainDomain
	if parts := handlePathParts(u.Path); len(parts) > 0 {
		// Use the last meaningful path part as category
		// (e.g. "technology" from /news/technology/).
		category := parts[len(parts)-1]
		// A lone "news" is too generic to be worth appending.
		if category != "news" || len(parts) > 1 {
			// Abbreviate only when the full form would not fit.
			if len(mainDomain)+1+len(category) > maxSubdomainLen {
				category = handleCategoryAbbrev(category)
			}
			subdomain = mainDomain + "-" + category
		}
	}

	// If still too long, drop the category and keep only the domain.
	if len(subdomain) > maxSubdomainLen {
		subdomain = mainDomain
	}
	// Final safety: truncate. The label is pure ASCII at this point, so
	// slicing by bytes cannot split a character; a hyphen exposed at the cut
	// is removed because labels must not end with one.
	if len(subdomain) > maxSubdomainLen {
		subdomain = strings.TrimRight(subdomain[:maxSubdomainLen], "-")
	}

	return subdomain + handleSuffix
}

// handleMainDomain extracts the label that identifies the site from a
// hostname (e.g. "bbc" from "feeds.bbci.co.uk"). It returns "" when the
// hostname yields no usable label.
func handleMainDomain(hostname string) string {
	// Tolerate fully-qualified names written with a trailing dot.
	hostname = strings.TrimSuffix(strings.ToLower(hostname), ".")
	labels := strings.Split(hostname, ".")

	// Drop the public suffix: two labels for the known second-level
	// registries, otherwise just the final label. The final label is always
	// removed so that TLDs missing from any list never end up as the domain.
	if n := len(labels); n >= 2 {
		switch labels[n-2] + "." + labels[n-1] {
		case "co.uk", "com.au", "net.au", "co.nz", "co.jp",
			"com.br", "co.in", "org.uk", "ac.uk":
			labels = labels[:n-2]
		default:
			labels = labels[:n-1]
		}
	}

	// Prefer the right-most label that is not a noise subdomain; if every
	// label is noise (e.g. "news.com"), fall back to the right-most label.
	domain := ""
	if n := len(labels); n > 0 {
		domain = labels[n-1]
	}
	for i := len(labels) - 1; i >= 0; i-- {
		switch labels[i] {
		case "", "www", "feeds", "rss", "feed", "api", "cdn", "static", "news":
			continue
		}
		domain = labels[i]
		break
	}

	// Hostnames may carry characters that are illegal in a handle
	// (underscores, non-ASCII letters); reduce the label to [a-z0-9-].
	domain = sanitizeHandleLabel(domain)

	// Special case: "bbci" should become "bbc".
	if domain == "bbci" {
		domain = "bbc"
	}
	return domain
}

// handlePathParts returns the cleaned, meaningful segments of a feed URL path
// in their original order, with feed-related noise removed.
func handlePathParts(path string) []string {
	path = strings.ToLower(path)

	// Remove common feed suffixes/extensions. Order matters: stripping
	// ".xml" first lets "/rss" match a path that ended in "/rss.xml".
	for _, suffix := range []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"} {
		path = strings.TrimSuffix(path, suffix)
	}

	var parts []string
	for _, seg := range strings.Split(strings.Trim(path, "/"), "/") {
		seg = cleanHandleSegment(seg)
		switch seg {
		case "", "rss", "feed", "feeds", "atom", "xml",
			"default", "index", "services", "nyt":
			continue // empty or noise segment
		}
		parts = append(parts, seg)
	}
	return parts
}

// handleCategoryAbbrev returns the short form of a long category name, or the
// category unchanged when no abbreviation is defined for it.
func handleCategoryAbbrev(category string) string {
	switch category {
	case "science-and-environment", "science-environment":
		return "sci-env"
	case "entertainment-and-arts", "entertainment-arts":
		return "ent-arts"
	case "technology":
		return "tech"
	case "business":
		return "biz"
	case "international":
		return "intl"
	case "environment":
		return "env"
	case "entertainment":
		return "ent"
	case "politics":
		return "pol"
	}
	return category
}

// sanitizeHandleLabel reduces an already lower-cased string to the characters
// allowed in a handle label: underscores and spaces become hyphens, every
// other character outside [a-z0-9-] is removed, runs of hyphens are
// collapsed, and leading/trailing hyphens are trimmed.
func sanitizeHandleLabel(s string) string {
	s = strings.ReplaceAll(s, "_", "-")
	s = strings.ReplaceAll(s, " ", "-")
	s = handleInvalidChars.ReplaceAllString(s, "")

	for strings.Contains(s, "--") {
		s = strings.ReplaceAll(s, "--", "-")
	}
	return strings.Trim(s, "-")
}

// cleanHandleSegment sanitizes a path segment for use in an AT Protocol
// handle segment. Handle segments must be alphanumeric with hyphens and no
// leading/trailing hyphens. The result may be empty.
func cleanHandleSegment(s string) string {
	// Remove a file extension (everything from the last dot), unless the
	// dot is the first character.
	if idx := strings.LastIndex(s, "."); idx > 0 {
		s = s[:idx]
	}

	s = strings.ToLower(s)

	// Strip one common feed affix from the segment itself,
	// e.g. "showrss" → "show", "rssworld" → "world". A segment that consists
	// solely of the affix is left alone so the caller can filter it as noise.
	for _, affix := range []string{"rss", "feed", "atom", "xml"} {
		if len(s) <= len(affix) {
			continue
		}
		if strings.HasSuffix(s, affix) {
			s = strings.TrimSuffix(s, affix)
			break
		}
		if strings.HasPrefix(s, affix) {
			s = strings.TrimPrefix(s, affix)
			break
		}
	}

	return sanitizeHandleLabel(s)
}

// SplitHandle strips the ".1440.news" suffix from handle and splits the
// remainder into an optional path prefix and a hostname.
//
// Candidate hostnames are tried from the longest (the whole remainder) down
// to the final two labels; the first candidate accepted by looksLikeHostname
// becomes the hostname and any labels in front of it become the prefix. When
// no candidate is accepted, the prefix is empty and the whole remainder is
// returned as the hostname.
//
// NOTE(review): looksLikeHostname only inspects the final label, which is the
// same for every candidate, so the longest candidate is accepted whenever any
// is and the returned prefix is empty in practice, e.g.
// "show.news.ycombinator.com.1440.news" yields
// ("", "show.news.ycombinator.com"). Confirm whether a stricter hostname
// check was intended.
func SplitHandle(handle string) (prefix string, hostname string) {
	const handleSuffix = ".1440.news"

	base := strings.TrimSuffix(handle, handleSuffix)
	labels := strings.Split(base, ".")

	// start is the index of the candidate's first label; every candidate
	// keeps at least two labels.
	for start := 0; start+1 < len(labels); start++ {
		candidate := strings.Join(labels[start:], ".")
		if !looksLikeHostname(candidate) {
			continue
		}
		return strings.Join(labels[:start], "."), candidate
	}

	// Nothing matched: treat the entire remainder as the hostname.
	return "", base
}

// isLikelyTLDPart reports whether s is one of a fixed set of lower-case
// labels treated as TLD-like. The comparison is exact and case-sensitive.
func isLikelyTLDPart(s string) bool {
	switch s {
	case "com", "org", "net", "edu", "gov",
		"io", "co", "uk", "de", "fr",
		"jp", "au", "ca", "nl", "se",
		"news", "blog", "tech", "dev":
		return true
	default:
		return false
	}
}

// isTwoPartTLD reports whether first and second, joined with a dot, form one
// of a fixed set of two-label suffixes such as "co.uk" or "com.au".
func isTwoPartTLD(first, second string) bool {
	switch first + "." + second {
	case "co.uk", "com.au", "co.jp", "co.nz", "org.uk", "net.au", "com.br":
		return true
	}
	return false
}

// looksLikeHostname reports whether s contains at least one dot and the text
// after its last dot is accepted by isLikelyTLDPart. Nothing else about s is
// checked.
func looksLikeHostname(s string) bool {
	dot := strings.LastIndex(s, ".")
	if dot < 0 {
		// A single label has no TLD part to inspect.
		return false
	}
	return isLikelyTLDPart(s[dot+1:])
}