Files
crawler/publisher.go
primal 254b751799 Add rich text links, language filter, and domain deny feature
- Use labeled links (Article · Audio) instead of raw URLs in posts
- Add language filter dropdown to dashboard with toggle selection
- Auto-deny feeds with no language on discovery
- Add deny/undeny buttons for domains to block crawling
- Denied domains set feeds to dead status, preventing future checks

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 12:36:58 -05:00

1196 lines
31 KiB
Go

package main
import (
"bytes"
"crypto/sha256"
"encoding/json"
"fmt"
"image"
_ "image/gif"
"image/jpeg"
_ "image/png"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"unicode/utf8"
_ "golang.org/x/image/webp"
"golang.org/x/image/draw"
)
// Publisher handles posting items to AT Protocol PDS
type Publisher struct {
	pdsHost    string       // base URL of the PDS; XRPC paths are appended with a leading "/", so no trailing slash expected
	httpClient *http.Client // shared client with a 30s timeout (set in NewPublisher)
}
// PDSSession holds authentication info for a PDS account
type PDSSession struct {
	DID        string `json:"did"`        // account's decentralized identifier, used as the repo name
	Handle     string `json:"handle"`     // account handle
	AccessJwt  string `json:"accessJwt"`  // bearer token sent on authenticated XRPC calls
	RefreshJwt string `json:"refreshJwt"` // refresh token (not referenced in this file)
}
// BskyPost represents an app.bsky.feed.post record
type BskyPost struct {
	Type      string      `json:"$type"`            // always "app.bsky.feed.post"
	Text      string      `json:"text"`             // post body (item title + link labels)
	CreatedAt string      `json:"createdAt"`        // RFC3339 timestamp
	Facets    []BskyFacet `json:"facets,omitempty"` // rich-text annotations (link facets)
	Embed     *BskyEmbed  `json:"embed,omitempty"`  // optional images or external link card
}
// BskyFacet annotates a byte range of the post text with one or more features
// (here: link features attached to the label words).
type BskyFacet struct {
	Index    BskyByteSlice `json:"index"`
	Features []BskyFeature `json:"features"`
}
// BskyByteSlice is a half-open [ByteStart, ByteEnd) range of UTF-8 byte
// offsets into the post text (offsets computed with len() in PublishItem).
type BskyByteSlice struct {
	ByteStart int `json:"byteStart"`
	ByteEnd   int `json:"byteEnd"`
}
// BskyFeature is a single facet feature; this file only emits
// "app.bsky.richtext.facet#link" features with a target URI.
type BskyFeature struct {
	Type string `json:"$type"`
	URI  string `json:"uri,omitempty"`
}
// BskyEmbed is the post embed: either an external link card (External set,
// $type "app.bsky.embed.external") or uploaded images (Images set,
// $type "app.bsky.embed.images") — never both.
type BskyEmbed struct {
	Type     string        `json:"$type"`
	External *BskyExternal `json:"external,omitempty"`
	Images   []BskyImage   `json:"images,omitempty"`
}
// BskyExternal is the payload of an external link-card embed.
type BskyExternal struct {
	URI         string   `json:"uri"`
	Title       string   `json:"title"`
	Description string   `json:"description"`
	Thumb       *BlobRef `json:"thumb,omitempty"` // optional uploaded thumbnail blob
}
// BskyImage is one image entry of an images embed.
type BskyImage struct {
	Alt         string           `json:"alt"`                   // alt text (the item title in this crawler)
	Image       *BlobRef         `json:"image"`                 // reference to the uploaded blob
	AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"` // pixel dimensions when known
}
// BskyAspectRatio carries the image's pixel dimensions for layout hints.
type BskyAspectRatio struct {
	Width  int `json:"width"`
	Height int `json:"height"`
}
// NewPublisher constructs a Publisher that talks to the given PDS host.
// The embedded HTTP client enforces a 30-second timeout on every request.
func NewPublisher(pdsHost string) *Publisher {
	client := &http.Client{Timeout: 30 * time.Second}
	return &Publisher{pdsHost: pdsHost, httpClient: client}
}
// CreateSession authenticates with the PDS and returns a session
// containing the access/refresh JWTs for the account.
func (p *Publisher) CreateSession(handle, password string) (*PDSSession, error) {
	creds, err := json.Marshal(map[string]string{
		"identifier": handle,
		"password":   password,
	})
	if err != nil {
		return nil, err
	}
	resp, err := p.httpClient.Post(
		p.pdsHost+"/xrpc/com.atproto.server.createSession",
		"application/json",
		bytes.NewReader(creds),
	)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		msg, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("auth failed: %s - %s", resp.Status, string(msg))
	}
	session := &PDSSession{}
	if err := json.NewDecoder(resp.Body).Decode(session); err != nil {
		return nil, err
	}
	return session, nil
}
// CreateAccount creates a new account on the PDS.
// Requires an invite code if the PDS has invites enabled.
func (p *Publisher) CreateAccount(handle, email, password, inviteCode string) (*PDSSession, error) {
	fields := map[string]interface{}{
		"handle":   handle,
		"email":    email,
		"password": password,
	}
	if inviteCode != "" {
		fields["inviteCode"] = inviteCode
	}
	reqBody, err := json.Marshal(fields)
	if err != nil {
		return nil, err
	}
	resp, err := p.httpClient.Post(
		p.pdsHost+"/xrpc/com.atproto.server.createAccount",
		"application/json",
		bytes.NewReader(reqBody),
	)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	raw, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("create account failed: %s - %s", resp.Status, string(raw))
	}
	var session PDSSession
	if err := json.Unmarshal(raw, &session); err != nil {
		return nil, err
	}
	return &session, nil
}
// CreateInviteCode creates an invite code using the PDS admin password
// (HTTP Basic Auth) and returns the code string.
func (p *Publisher) CreateInviteCode(adminPassword string, useCount int) (string, error) {
	reqBody, err := json.Marshal(map[string]interface{}{"useCount": useCount})
	if err != nil {
		return "", err
	}
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.server.createInviteCode", bytes.NewReader(reqBody))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	// PDS admin APIs use Basic Auth with "admin" as the username.
	req.SetBasicAuth("admin", adminPassword)
	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	raw, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("create invite failed: %s - %s", resp.Status, string(raw))
	}
	var parsed struct {
		Code string `json:"code"`
	}
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return "", err
	}
	return parsed.Code, nil
}
// TID alphabet for base32-sortable encoding
const tidAlphabet = "234567abcdefghijklmnopqrstuvwxyz"
// GenerateRkey creates a deterministic TID-format rkey from a GUID and timestamp
// TIDs are required by Bluesky relay for indexing - custom rkeys don't sync
// Format: 13 chars base32-sortable, 53 bits timestamp + 10 bits clock ID
func GenerateRkey(guid string, timestamp time.Time) string {
if guid == "" {
return ""
}
// Get microseconds since Unix epoch (53 bits)
microsInt := timestamp.UnixMicro()
if microsInt < 0 {
microsInt = 0
}
// Convert to uint64 and mask to 53 bits
micros := uint64(microsInt) & ((1 << 53) - 1)
// Generate deterministic 10-bit clock ID from GUID hash
// Use XOR of multiple hash bytes to reduce collisions
hash := sha256.Sum256([]byte(guid))
// XOR bytes 0-3 together, then 4-7, combine for more entropy
h1 := uint64(hash[0]) ^ uint64(hash[2]) ^ uint64(hash[4]) ^ uint64(hash[6])
h2 := uint64(hash[1]) ^ uint64(hash[3]) ^ uint64(hash[5]) ^ uint64(hash[7])
clockID := (h1 << 2) | (h2 >> 6)
clockID = clockID & ((1 << 10) - 1) // 10 bits = 0-1023
// Combine: top bit 0, 53 bits timestamp, 10 bits clock ID
tid := (micros << 10) | clockID
// Encode as base32-sortable (13 characters)
var result [13]byte
for i := 12; i >= 0; i-- {
result[i] = tidAlphabet[tid&0x1f]
tid >>= 5
}
return string(result[:])
}
// extractURLs finds all http/https URLs in a string, trimming trailing
// punctuation that is most likely sentence punctuation rather than part of
// the URL itself.
func extractURLs(text string) []string {
	pattern := regexp.MustCompile(`https?://[^\s<>"'\)]+`)
	var found []string
	for _, match := range pattern.FindAllString(text, -1) {
		if cleaned := strings.TrimRight(match, ".,;:!?"); cleaned != "" {
			found = append(found, cleaned)
		}
	}
	return found
}
// PublishItem posts a feed item to the PDS
// Returns the AT URI of the created record, or error
//
// Pipeline:
//  1. Collect up to two URLs: the article link, an HN comments link found in
//     the description, or (if there is room) an audio/video enclosure URL.
//  2. Build the post text as "<title>\n\nArticle · Comments", truncating the
//     title to respect Bluesky's 300-grapheme limit (rune count is used as an
//     approximation of graphemes).
//  3. Attach a rich-text link facet over each label word.
//  4. Embed uploaded images when available, otherwise an external link card.
//  5. Write the record with a deterministic TID rkey so the same item always
//     maps to the same record.
func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
	if item.GUID == "" && item.Link == "" {
		return "", fmt.Errorf("item has no GUID or link, cannot publish")
	}
	// Collect URLs: main link + HN comments link (if applicable)
	// Limit to 2 URLs max to stay under 300 grapheme limit
	urlSet := make(map[string]bool) // dedupe guard
	var allURLs []string            // ordered; index 0 is always the article link
	// Add main link first
	if item.Link != "" {
		urlSet[item.Link] = true
		allURLs = append(allURLs, item.Link)
	}
	// For HN feeds, add comments link from description (looks like "https://news.ycombinator.com/item?id=...")
	descURLs := extractURLs(item.Description)
	for _, u := range descURLs {
		if strings.Contains(u, "news.ycombinator.com/item") && !urlSet[u] {
			urlSet[u] = true
			allURLs = append(allURLs, u)
			break // Only add one comments link
		}
	}
	// Add enclosure URL for podcasts/media (audio/video) if we have room
	// Bluesky has 300 char limit, so only add if total URLs + minimal title fits
	if len(allURLs) < 2 && item.Enclosure != nil && item.Enclosure.URL != "" {
		encType := strings.ToLower(item.Enclosure.Type)
		if strings.HasPrefix(encType, "audio/") || strings.HasPrefix(encType, "video/") {
			if !urlSet[item.Enclosure.URL] {
				// Calculate if enclosure would fit (need ~60 chars for title + separators)
				currentURLLen := 0
				for _, u := range allURLs {
					currentURLLen += len(u) + 2 // +2 for \n\n
				}
				enclosureLen := len(item.Enclosure.URL) + 2
				if currentURLLen+enclosureLen < 235 { // Leave 60 chars for title
					urlSet[item.Enclosure.URL] = true
					allURLs = append(allURLs, item.Enclosure.URL)
				}
			}
		}
	}
	// Build post text: title + link labels
	// Bluesky has 300 grapheme limit - use rune count as approximation
	const maxGraphemes = 295 // Leave some margin
	// Create labeled links: "Article", "Audio", etc.
	type labeledLink struct {
		Label string
		URL   string
	}
	var links []labeledLink
	for i, u := range allURLs {
		if i == 0 {
			// First URL is the article link
			links = append(links, labeledLink{Label: "Article", URL: u})
		} else if item.Enclosure != nil && u == item.Enclosure.URL {
			// Enclosure URL - label based on type
			encType := strings.ToLower(item.Enclosure.Type)
			if strings.HasPrefix(encType, "audio/") {
				links = append(links, labeledLink{Label: "Audio", URL: u})
			} else if strings.HasPrefix(encType, "video/") {
				links = append(links, labeledLink{Label: "Video", URL: u})
			} else {
				links = append(links, labeledLink{Label: "Media", URL: u})
			}
		} else if strings.Contains(u, "news.ycombinator.com") {
			links = append(links, labeledLink{Label: "Comments", URL: u})
		} else {
			links = append(links, labeledLink{Label: "Link", URL: u})
		}
	}
	// Calculate space needed for labels (in runes)
	// Format: "Article · Audio" or just "Article"
	labelSpace := 0
	for i, link := range links {
		labelSpace += utf8.RuneCountInString(link.Label)
		if i > 0 {
			labelSpace += 3 // " · " separator
		}
	}
	labelSpace += 2 // \n\n before labels
	// Truncate title if needed
	title := item.Title
	titleRunes := utf8.RuneCountInString(title)
	maxTitleRunes := maxGraphemes - labelSpace - 3 // -3 for "..."
	if titleRunes+labelSpace > maxGraphemes {
		if maxTitleRunes > 10 {
			runes := []rune(title)
			if len(runes) > maxTitleRunes {
				title = string(runes[:maxTitleRunes]) + "..."
			}
		} else {
			// Labels leave almost no room; fall back to a fixed 50-rune title.
			runes := []rune(title)
			if len(runes) > 50 {
				title = string(runes[:50]) + "..."
			}
		}
	}
	// Build final text with labels
	var textBuilder strings.Builder
	textBuilder.WriteString(title)
	if len(links) > 0 {
		textBuilder.WriteString("\n\n")
		for i, link := range links {
			if i > 0 {
				textBuilder.WriteString(" · ")
			}
			textBuilder.WriteString(link.Label)
		}
	}
	text := textBuilder.String()
	// Use original publication date if available, otherwise current time
	createdAt := time.Now()
	if !item.PubDate.IsZero() {
		createdAt = item.PubDate
	}
	post := BskyPost{
		Type:      "app.bsky.feed.post",
		Text:      text,
		CreatedAt: createdAt.Format(time.RFC3339),
	}
	// Add facets for labeled links
	// Find each label in the text and create a facet linking to its URL.
	// The search starts past the title, so a label word that happens to occur
	// in the title itself is never matched. Offsets are UTF-8 byte positions.
	searchPos := len(title) + 2 // Start after title + \n\n
	for _, link := range links {
		labelStart := strings.Index(text[searchPos:], link.Label)
		if labelStart >= 0 {
			labelStart += searchPos
			byteStart := len(text[:labelStart])
			byteEnd := byteStart + len(link.Label)
			post.Facets = append(post.Facets, BskyFacet{
				Index: BskyByteSlice{
					ByteStart: byteStart,
					ByteEnd:   byteEnd,
				},
				Features: []BskyFeature{
					{
						Type: "app.bsky.richtext.facet#link",
						URI:  link.URL,
					},
				},
			})
			searchPos = labelStart + len(link.Label)
		}
	}
	// Decide embed type based on content
	// Priority: images > external link card
	if len(item.ImageURLs) > 0 {
		// Try to upload images (up to 4)
		uploadedImages := p.uploadImages(session, item.ImageURLs, item.Title)
		if len(uploadedImages) > 0 {
			post.Embed = &BskyEmbed{
				Type:   "app.bsky.embed.images",
				Images: uploadedImages,
			}
		}
	}
	// Fall back to external embed if no images were uploaded
	if post.Embed == nil && len(allURLs) > 0 {
		external := &BskyExternal{
			URI:         allURLs[0],
			Title:       item.Title,
			Description: truncate(stripHTML(item.Description), 300),
		}
		// Try to add thumbnail from first image
		if len(item.ImageURLs) > 0 {
			if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
				external.Thumb = thumb
			}
		}
		post.Embed = &BskyEmbed{
			Type:     "app.bsky.embed.external",
			External: external,
		}
	}
	// Use GUID + discoveredAt for deterministic rkey
	// This allows regenerating a new rkey by updating discoveredAt if needed
	guidForRkey := item.GUID
	if guidForRkey == "" {
		guidForRkey = item.Link
	}
	// Use PubDate for rkey to match createdAt ordering, fall back to DiscoveredAt
	rkeyTime := item.PubDate
	if rkeyTime.IsZero() {
		rkeyTime = item.DiscoveredAt
	}
	rkey := GenerateRkey(guidForRkey, rkeyTime)
	// Create the record with deterministic rkey
	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.feed.post",
		"rkey":       rkey,
		"record":     post,
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return "", err
	}
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	respBody, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("create record failed: %s - %s", resp.Status, string(respBody))
	}
	var result struct {
		URI string `json:"uri"`
		CID string `json:"cid"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return "", err
	}
	return result.URI, nil
}
// uploadImages fetches and uploads up to 4 images, returning one BskyImage
// per successful upload (failures are silently skipped).
func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altText string) []BskyImage {
	limit := len(imageURLs)
	if limit > 4 {
		limit = 4
	}
	var uploaded []BskyImage
	for _, imageURL := range imageURLs[:limit] {
		res := p.fetchAndUploadImageWithDimensions(session, imageURL)
		if res == nil || res.Blob == nil {
			continue
		}
		entry := BskyImage{Alt: altText, Image: res.Blob}
		if res.Width > 0 && res.Height > 0 {
			entry.AspectRatio = &BskyAspectRatio{Width: res.Width, Height: res.Height}
		}
		uploaded = append(uploaded, entry)
	}
	return uploaded
}
// FetchFavicon tries to get a favicon URL for a site by probing common icon
// locations with HEAD requests. Returns the first candidate that responds OK
// with an image content type (or a .ico path); falls back to Google's favicon
// service, which reliably returns a PNG.
func (p *Publisher) FetchFavicon(siteURL string) string {
	if siteURL == "" {
		return ""
	}
	// Ensure a scheme is present so url.Parse sees a host.
	if !strings.Contains(siteURL, "://") {
		siteURL = "https://" + siteURL
	}
	parsed, err := url.Parse(siteURL)
	if err != nil {
		return ""
	}
	candidates := []string{
		fmt.Sprintf("https://%s/favicon.ico", parsed.Host),
		fmt.Sprintf("https://%s/favicon.png", parsed.Host),
		fmt.Sprintf("https://%s/apple-touch-icon.png", parsed.Host),
	}
	for _, candidate := range candidates {
		resp, err := p.httpClient.Head(candidate)
		if err != nil {
			continue
		}
		resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			continue
		}
		contentType := resp.Header.Get("Content-Type")
		if strings.HasPrefix(contentType, "image/") || strings.HasSuffix(candidate, ".ico") {
			return candidate
		}
	}
	// Fallback to Google's favicon service (reliable, returns PNG).
	return fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", parsed.Host)
}
// ImageUploadResult contains the uploaded blob and image dimensions
type ImageUploadResult struct {
	Blob   *BlobRef // reference returned by the PDS uploadBlob endpoint
	Width  int      // pixel width (1 if the image could not be decoded)
	Height int      // pixel height (1 if the image could not be decoded)
}
// fetchAndUploadImage downloads an image and uploads it to the PDS,
// returning just the blob reference (dimensions discarded); nil on failure.
func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef {
	if result := p.fetchAndUploadImageWithDimensions(session, imageURL); result != nil {
		return result.Blob
	}
	return nil
}
// upgradeImageURL attempts to get a larger version of known CDN image URLs.
// Currently handles BBC's image CDN: /standard/240/ and /standard/480/ paths
// are rewritten to /standard/800/.
func upgradeImageURL(imageURL string) string {
	if !strings.Contains(imageURL, "ichef.bbci.co.uk") {
		return imageURL
	}
	for _, small := range []string{"/standard/240/", "/standard/480/"} {
		imageURL = strings.Replace(imageURL, small, "/standard/800/", 1)
	}
	return imageURL
}
// fetchAndUploadImageWithDimensions downloads an image, resizes it under the
// PDS blob size limit if needed, uploads it, and returns the blob reference
// plus pixel dimensions. Returns nil on any failure — this is best-effort;
// callers fall back to posting without the image.
func (p *Publisher) fetchAndUploadImageWithDimensions(session *PDSSession, imageURL string) *ImageUploadResult {
	// Upgrade image URL to larger size if possible
	imageURL = upgradeImageURL(imageURL)
	// Fetch the image
	resp, err := p.httpClient.Get(imageURL)
	if err != nil {
		return nil
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil
	}
	// Check content type
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		// Try to guess from URL extension
		if strings.HasSuffix(strings.ToLower(imageURL), ".png") {
			contentType = "image/png"
		} else if strings.HasSuffix(strings.ToLower(imageURL), ".gif") {
			contentType = "image/gif"
		} else if strings.HasSuffix(strings.ToLower(imageURL), ".webp") {
			contentType = "image/webp"
		} else {
			contentType = "image/jpeg" // Default
		}
	}
	// Only accept image types
	if !strings.HasPrefix(contentType, "image/") {
		return nil
	}
	// Read image data (limit to 2MB to allow for resize headroom).
	// NOTE(review): a source image larger than 2MB is silently truncated by
	// the LimitReader, which will make the decode below fail and return nil.
	data, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
	if err != nil || len(data) == 0 {
		return nil
	}
	// Decode image header only to get dimensions (cheap, no full decode)
	imgConfig, _, err := image.DecodeConfig(bytes.NewReader(data))
	width, height := 1, 1 // Default if decode fails
	if err == nil {
		width, height = imgConfig.Width, imgConfig.Height
	}
	// Bluesky blob limit is ~976KB, use 900KB as safe threshold
	const maxBlobSize = 900 * 1024
	// If image is too large, resize it
	if len(data) > maxBlobSize {
		// Decode the full image for resizing (GIF/PNG/WebP supported via
		// the blank imports at the top of the file)
		img, _, err := image.Decode(bytes.NewReader(data))
		if err != nil {
			return nil // Can't decode, can't resize
		}
		// Scale down iteratively until under limit. Dimensions are always
		// recomputed from the ORIGINAL size; scaleFactor shrinks 0.9, 0.72, ...
		scaleFactor := 0.9 // Start with 90% and iterate if needed
		for attempt := 0; attempt < 5; attempt++ {
			newWidth := int(float64(width) * scaleFactor)
			newHeight := int(float64(height) * scaleFactor)
			// Minimum dimensions
			if newWidth < 100 {
				newWidth = 100
			}
			if newHeight < 100 {
				newHeight = 100
			}
			// Create resized image with high-quality Catmull-Rom interpolation
			resized := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight))
			draw.CatmullRom.Scale(resized, resized.Bounds(), img, img.Bounds(), draw.Over, nil)
			// Encode as JPEG (always re-encodes to JPEG regardless of source format)
			var buf bytes.Buffer
			if err := jpeg.Encode(&buf, resized, &jpeg.Options{Quality: 85}); err != nil {
				return nil
			}
			if buf.Len() <= maxBlobSize {
				data = buf.Bytes()
				width = newWidth
				height = newHeight
				contentType = "image/jpeg"
				break
			}
			// Still too large, reduce scale further
			scaleFactor *= 0.8
		}
		// If still too large after 5 attempts, give up
		if len(data) > maxBlobSize {
			return nil
		}
	}
	// Upload to PDS
	blob, err := p.UploadBlob(session, data, contentType)
	if err != nil {
		return nil
	}
	return &ImageUploadResult{
		Blob:   blob,
		Width:  width,
		Height: height,
	}
}
// truncate shortens s to at most maxLen bytes, appending "..." when content
// is dropped. Unlike a plain byte slice, it backs up to a UTF-8 rune boundary
// so the result is never invalid UTF-8, and it tolerates maxLen values
// smaller than the 3-byte ellipsis (the previous s[:maxLen-3] panicked for
// maxLen < 3 and could split a multi-byte rune).
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen - 3 // reserve room for "..."
	if cut < 0 {
		cut = 0
	}
	// Back up so we never cut a multi-byte rune in half.
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	if cut == 0 {
		// No room for content plus ellipsis: hard-cut at a rune boundary.
		cut = maxLen
		for cut > 0 && !utf8.RuneStart(s[cut]) {
			cut--
		}
		return s[:cut]
	}
	return s[:cut] + "..."
}
// Patterns used by stripHTML, compiled once at package scope instead of on
// every call.
var (
	htmlTagPattern    = regexp.MustCompile(`<[^>]*>`)
	whitespacePattern = regexp.MustCompile(`\s+`)
)

// stripHTML removes HTML tags from a string, decodes a handful of common
// HTML entities, and collapses runs of whitespace into single spaces.
//
// &amp; is decoded LAST so that double-escaped sequences such as "&amp;lt;"
// decode correctly to "&lt;" instead of being decoded twice to "<".
func stripHTML(s string) string {
	// Remove HTML tags
	s = htmlTagPattern.ReplaceAllString(s, "")
	// Decode common HTML entities (ampersand last; see above)
	s = strings.ReplaceAll(s, "&lt;", "<")
	s = strings.ReplaceAll(s, "&gt;", ">")
	s = strings.ReplaceAll(s, "&quot;", "\"")
	s = strings.ReplaceAll(s, "&#39;", "'")
	s = strings.ReplaceAll(s, "&nbsp;", " ")
	s = strings.ReplaceAll(s, "&amp;", "&")
	// Collapse whitespace
	s = whitespacePattern.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}
// DeriveHandleFromFeed generates an AT Protocol handle from a feed URL
// Format: {domain}-{category}.1440.news
// AT Protocol allows up to 63 characters per label, but the PDS
// restricts the first segment to 18 characters for local handles.
// Examples:
// feeds.bbci.co.uk/news/technology/rss.xml → bbc-technology.1440.news
// news.ycombinator.com/rss → ycombinator.1440.news
//
// Pipeline: normalize the URL, strip feed-ish suffixes from the path, filter
// noise words from both path segments and hostname labels, drop the TLD,
// then assemble "domain" or "domain-category" and clamp to the 18-char limit.
func DeriveHandleFromFeed(feedURL string) string {
	const maxSubdomainLen = 18 // PDS limit for first segment
	// Ensure we have a scheme for parsing
	if !strings.Contains(feedURL, "://") {
		feedURL = "https://" + feedURL
	}
	u, err := url.Parse(feedURL)
	if err != nil {
		return ""
	}
	hostname := strings.ToLower(u.Hostname())
	path := strings.ToLower(u.Path)
	// Remove common feed suffixes/extensions (applied once each, in order)
	suffixesToRemove := []string{".xml", ".rss", ".atom", ".json", "/rss", "/feed", "/atom", "/index"}
	for _, suffix := range suffixesToRemove {
		path = strings.TrimSuffix(path, suffix)
	}
	// Split path into segments and filter noise
	segments := strings.Split(strings.Trim(path, "/"), "/")
	skipPathWords := map[string]bool{
		"rss": true, "feed": true, "feeds": true, "atom": true,
		"xml": true, "default": true, "index": true, "services": true,
		"nyt": true, "blog": true,
	}
	var pathParts []string
	for _, seg := range segments {
		seg = cleanHandleSegment(seg)
		if seg != "" && !skipPathWords[seg] {
			pathParts = append(pathParts, seg)
		}
	}
	// Split hostname and extract the meaningful domain
	hostParts := strings.Split(hostname, ".")
	// Two-part TLDs to handle specially
	twoPartTLDs := map[string]bool{
		"co.uk": true, "com.au": true, "co.nz": true, "co.jp": true,
		"com.br": true, "co.in": true, "org.uk": true, "ac.uk": true,
	}
	// Check for two-part TLD
	if len(hostParts) >= 2 {
		possibleTwoPartTLD := hostParts[len(hostParts)-2] + "." + hostParts[len(hostParts)-1]
		if twoPartTLDs[possibleTwoPartTLD] {
			hostParts = hostParts[:len(hostParts)-2]
		} else {
			// Single TLD - remove it
			singleTLDs := map[string]bool{
				"com": true, "org": true, "net": true, "io": true,
				"edu": true, "gov": true, "uk": true, "de": true, "fr": true,
			}
			if singleTLDs[hostParts[len(hostParts)-1]] {
				hostParts = hostParts[:len(hostParts)-1]
			}
		}
	}
	// Skip noise subdomains
	skipHostWords := map[string]bool{
		"www": true, "feeds": true, "rss": true, "feed": true,
		"api": true, "cdn": true, "static": true, "news": true,
	}
	var meaningfulHostParts []string
	for _, part := range hostParts {
		if !skipHostWords[part] && part != "" {
			meaningfulHostParts = append(meaningfulHostParts, part)
		}
	}
	// Get the main domain (e.g., "bbci", "ycombinator", "nytimes")
	var mainDomain string
	if len(meaningfulHostParts) > 0 {
		mainDomain = meaningfulHostParts[len(meaningfulHostParts)-1]
	} else if len(hostParts) > 0 {
		// Every label was a noise word (e.g. "news.com"); fall back to the
		// last raw label rather than producing an empty domain.
		mainDomain = hostParts[len(hostParts)-1]
	}
	// Special case: "bbci" should become "bbc"
	if mainDomain == "bbci" {
		mainDomain = "bbc"
	}
	// Abbreviations for long category names to fit 18-char limit
	categoryAbbrevs := map[string]string{
		"science-and-environment": "sci-env",
		"entertainment-and-arts":  "ent-arts",
		"science-environment":     "sci-env",
		"entertainment-arts":      "ent-arts",
		"technology":              "tech",
		"business":                "biz",
		"international":           "intl",
		"environment":             "env",
		"entertainment":           "ent",
		"politics":                "pol",
	}
	// Build subdomain: domain + category (from path)
	var subdomain string
	if len(pathParts) > 0 {
		// Use last meaningful path part as category (e.g., "technology" from /news/technology/)
		category := pathParts[len(pathParts)-1]
		// Skip generic categories
		if category == "news" && len(pathParts) == 1 {
			subdomain = mainDomain
		} else {
			// Try to abbreviate only if the full subdomain would be too long
			fullSubdomain := mainDomain + "-" + category
			if len(fullSubdomain) > maxSubdomainLen {
				if abbrev, ok := categoryAbbrevs[category]; ok {
					category = abbrev
				}
			}
			subdomain = mainDomain + "-" + category
		}
	} else {
		subdomain = mainDomain
	}
	// If still too long, just use main hostname
	if len(subdomain) > maxSubdomainLen {
		subdomain = mainDomain
	}
	// Final safety: truncate if still too long
	if len(subdomain) > maxSubdomainLen {
		subdomain = subdomain[:maxSubdomainLen]
	}
	subdomain = strings.Trim(subdomain, "-")
	// Collapse multiple hyphens
	for strings.Contains(subdomain, "--") {
		subdomain = strings.ReplaceAll(subdomain, "--", "-")
	}
	return subdomain + ".1440.news"
}
// cleanHandleSegment sanitizes a string for use in an AT Protocol handle
// segment: lowercase alphanumerics and hyphens only, with no leading or
// trailing hyphens. Also drops a file extension and a single feed-related
// affix ("rss", "feed", "atom", "xml") from either end.
func cleanHandleSegment(s string) string {
	// Drop a trailing file extension (".xml", ".rss", ...), if any.
	if dot := strings.LastIndex(s, "."); dot > 0 {
		s = s[:dot]
	}
	s = strings.ToLower(s)
	// Strip at most one feed affix from either end, suffix checked first:
	// "showrss" → "show", "rssworld" → "world".
	for _, affix := range []string{"rss", "feed", "atom", "xml"} {
		if len(s) > len(affix) && strings.HasSuffix(s, affix) {
			s = s[:len(s)-len(affix)]
			break
		}
		if len(s) > len(affix) && strings.HasPrefix(s, affix) {
			s = s[len(affix):]
			break
		}
	}
	// Normalize separators to hyphens.
	s = strings.NewReplacer("_", "-", " ", "-").Replace(s)
	// Drop anything that is not a lowercase alphanumeric or hyphen.
	s = regexp.MustCompile(`[^a-z0-9-]`).ReplaceAllString(s, "")
	// Collapse hyphen runs, then trim edge hyphens.
	for strings.Contains(s, "--") {
		s = strings.ReplaceAll(s, "--", "-")
	}
	return strings.Trim(s, "-")
}
// SplitHandle extracts the path prefix and hostname from a derived handle.
// Example: show.news.ycombinator.com.1440.news → ("show", "news.ycombinator.com")
func SplitHandle(handle string) (prefix string, hostname string) {
	// Strip the fixed handle suffix before analyzing the labels.
	trimmed := strings.TrimSuffix(handle, ".1440.news")
	labels := strings.Split(trimmed, ".")
	if len(labels) >= 2 {
		// Find the earliest split point where the remainder looks like a
		// real hostname; everything before it becomes the prefix.
		for i := range labels[:len(labels)-1] {
			candidate := strings.Join(labels[i:], ".")
			if !looksLikeHostname(candidate) {
				continue
			}
			if i > 0 {
				prefix = strings.Join(labels[:i], ".")
			}
			return prefix, candidate
		}
	}
	// Fallback: no prefix, the entire remainder is the hostname.
	return "", trimmed
}
// isLikelyTLDPart reports whether s matches one of the TLD-like labels this
// crawler recognizes as a hostname terminator. A switch over constant cases
// avoids rebuilding a map on every call.
func isLikelyTLDPart(s string) bool {
	switch s {
	case "com", "org", "net", "edu", "gov",
		"io", "co", "uk", "de", "fr",
		"jp", "au", "ca", "nl", "se",
		"news", "blog", "tech", "dev":
		return true
	}
	return false
}
// isTwoPartTLD reports whether first+"."+second is a recognized two-part TLD
// (e.g. "co"+"uk" → co.uk). A switch over constant cases avoids rebuilding a
// map on every call.
func isTwoPartTLD(first, second string) bool {
	switch first + "." + second {
	case "co.uk", "com.au", "co.jp", "co.nz",
		"org.uk", "net.au", "com.br":
		return true
	}
	return false
}
// looksLikeHostname reports whether s resembles a hostname: it must contain
// at least one dot and its final label must be a TLD-like part.
func looksLikeHostname(s string) bool {
	lastDot := strings.LastIndex(s, ".")
	if lastDot < 0 {
		return false
	}
	return isLikelyTLDPart(s[lastDot+1:])
}
// BlobRef represents a blob reference as returned by uploadBlob, used for
// post images, link-card thumbnails, and profile avatars.
type BlobRef struct {
	Type     string `json:"$type"`    // blob type discriminator from the PDS response
	Ref      Link   `json:"ref"`      // CID link to the stored blob
	MimeType string `json:"mimeType"` // content type the blob was uploaded with
	Size     int64  `json:"size"`     // blob size in bytes
}
// Link wraps a DAG-JSON style {"$link": "<cid>"} reference.
type Link struct {
	Link string `json:"$link"`
}
// UploadBlob uploads raw bytes to the PDS with the given MIME type and
// returns the blob reference to embed in records.
func (p *Publisher) UploadBlob(session *PDSSession, data []byte, mimeType string) (*BlobRef, error) {
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.uploadBlob", bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", mimeType)
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
	resp, err := p.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	raw, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("upload blob failed: %s - %s", resp.Status, string(raw))
	}
	var parsed struct {
		Blob BlobRef `json:"blob"`
	}
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return nil, err
	}
	return &parsed.Blob, nil
}
// UpdateProfile updates the app.bsky.actor.profile record for an account.
// Existing profile fields are preserved; only non-empty displayName /
// description and a non-nil avatar are overwritten. When a current profile
// exists, its CID is passed as swapRecord so the put is compare-and-swap.
func (p *Publisher) UpdateProfile(session *PDSSession, displayName, description string, avatar *BlobRef) error {
	// First, get the current profile to preserve any existing fields.
	getReq, err := http.NewRequest("GET",
		p.pdsHost+"/xrpc/com.atproto.repo.getRecord?repo="+session.DID+"&collection=app.bsky.actor.profile&rkey=self",
		nil)
	if err != nil {
		return err
	}
	getReq.Header.Set("Authorization", "Bearer "+session.AccessJwt)
	getResp, err := p.httpClient.Do(getReq)
	var existingCID string
	profile := map[string]interface{}{
		"$type": "app.bsky.actor.profile",
	}
	if err == nil && getResp.StatusCode == http.StatusOK {
		var existing struct {
			CID   string                 `json:"cid"`
			Value map[string]interface{} `json:"value"`
		}
		decodeErr := json.NewDecoder(getResp.Body).Decode(&existing)
		// Close immediately rather than deferring to function exit.
		getResp.Body.Close()
		if decodeErr == nil {
			existingCID = existing.CID
			// Guard: a 200 response with a missing/null "value" would
			// otherwise replace profile with a nil map, making the field
			// assignments below panic.
			if existing.Value != nil {
				profile = existing.Value
			}
		}
	} else if getResp != nil {
		getResp.Body.Close()
	}
	// Update only the fields the caller supplied.
	if displayName != "" {
		profile["displayName"] = displayName
	}
	if description != "" {
		profile["description"] = description
	}
	if avatar != nil {
		profile["avatar"] = avatar
	}
	// Put the record back under the fixed "self" rkey.
	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.actor.profile",
		"rkey":       "self",
		"record":     profile,
	}
	if existingCID != "" {
		// Compare-and-swap: fail if the profile changed since we read it.
		payload["swapRecord"] = existingCID
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}
	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.putRecord", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)
	resp, err := p.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	respBody, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("update profile failed: %s - %s", resp.Status, string(respBody))
	}
	return nil
}
// FetchFavicon downloads favicon/icon bytes for a site, trying well-known
// icon paths in order of expected quality (apple-touch-icon first). Returns
// the raw image bytes and their MIME type, or an error if no candidate
// responded with a usable icon.
//
// NOTE(review): this package-level function is distinct from the
// (*Publisher).FetchFavicon method, which returns a URL string instead.
func FetchFavicon(siteURL string) ([]byte, string, error) {
	// Ensure a scheme is present so url.Parse sees a host.
	if !strings.HasPrefix(siteURL, "http") {
		siteURL = "https://" + siteURL
	}
	u, err := url.Parse(siteURL)
	if err != nil {
		return nil, "", err
	}
	baseURL := u.Scheme + "://" + u.Host
	// Try apple-touch-icon first (usually higher quality)
	iconURLs := []string{
		baseURL + "/apple-touch-icon.png",
		baseURL + "/apple-touch-icon-precomposed.png",
		baseURL + "/favicon.png",
		baseURL + "/favicon.ico",
	}
	client := &http.Client{Timeout: 10 * time.Second}
	for _, iconURL := range iconURLs {
		data, contentType, err := fetchIconData(client, iconURL)
		if err != nil {
			continue
		}
		return data, contentType, nil
	}
	return nil, "", fmt.Errorf("no favicon found for %s", siteURL)
}

// fetchIconData downloads a single icon candidate. Extracted as a helper so
// each response body is closed at the end of its own attempt, instead of the
// previous defer-in-a-loop pattern that held every body open until the outer
// function returned.
func fetchIconData(client *http.Client, iconURL string) ([]byte, string, error) {
	resp, err := client.Get(iconURL)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("unexpected status %s", resp.Status)
	}
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", err
	}
	// Determine mime type, guessing from the extension when the server is silent.
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		switch {
		case strings.HasSuffix(iconURL, ".png"):
			contentType = "image/png"
		case strings.HasSuffix(iconURL, ".ico"):
			contentType = "image/x-icon"
		default:
			contentType = "image/png" // default
		}
	}
	return data, contentType, nil
}