Files
crawler/publisher.go
primal 1066f42189 Refactor large Go files into focused modules
Split dashboard.go (3,528 lines) into:
- routes.go: HTTP route registration
- api_domains.go: Domain API handlers
- api_feeds.go: Feed API handlers
- api_publish.go: Publishing API handlers
- api_search.go: Search API handlers
- templates.go: HTML templates
- dashboard.go: Stats functions only (235 lines)

Split publisher.go (1,502 lines) into:
- pds_auth.go: Authentication and account management
- pds_records.go: Record operations (upload, update, delete)
- handle.go: Handle derivation from feed URLs
- image.go: Image processing and favicon fetching
- publisher.go: Core types and PublishItem (439 lines)

Split feed.go (1,137 lines) into:
- item.go: Item struct and DB operations
- feed_check.go: Feed checking and processing
- feed.go: Feed struct and DB operations (565 lines)

Also includes domain import batch size increase (1k -> 100k).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 22:25:02 -05:00

440 lines
11 KiB
Go

package main
import (
"bytes"
"crypto/sha256"
"encoding/json"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"time"
)
// Publisher posts feed items to an AT Protocol PDS over HTTP.
// Construct one with NewPublisher.
type Publisher struct {
// pdsHost is prepended verbatim to XRPC paths (for example
// "/xrpc/com.atproto.repo.createRecord") when request URLs are built.
pdsHost string
// httpClient performs the outgoing requests.
httpClient *http.Client
}
// PDSSession holds authentication info for a PDS account.
// PublishItem uses DID as the "repo" of the record it creates and sends
// AccessJwt as the bearer token of the request.
type PDSSession struct {
// DID identifies the account's repository.
DID string `json:"did"`
// Handle is the account handle; not read by the code in this file.
Handle string `json:"handle"`
// AccessJwt is sent in the "Authorization: Bearer ..." header.
AccessJwt string `json:"accessJwt"`
// RefreshJwt is presumably used to renew the session elsewhere - TODO confirm.
RefreshJwt string `json:"refreshJwt"`
}
// BskyPost represents an app.bsky.feed.post record as it is serialized to JSON.
type BskyPost struct {
// Type is the record type; PublishItem sets it to "app.bsky.feed.post".
Type string `json:"$type"`
// Text is the post body; PublishItem fills it with the hashtag line (or leaves it empty).
Text string `json:"text"`
// CreatedAt is a timestamp string; PublishItem formats it with time.RFC3339.
CreatedAt string `json:"createdAt"`
// Facets annotate byte ranges of Text; omitted from JSON when empty.
Facets []BskyFacet `json:"facets,omitempty"`
// Embed is the optional attachment (e.g. a link card); omitted from JSON when nil.
Embed *BskyEmbed `json:"embed,omitempty"`
}
// BskyFacet attaches one or more features (such as a hashtag) to a byte
// range of a post's text.
type BskyFacet struct {
// Index is the byte range of the post text the features apply to.
Index BskyByteSlice `json:"index"`
// Features lists what the range represents; formatTagsForPost emits a single tag feature per facet.
Features []BskyFeature `json:"features"`
}
// BskyByteSlice is a range of byte offsets into a post's text.
// As produced by formatTagsForPost, ByteStart is the offset of the leading
// '#' and ByteEnd is ByteStart plus the byte length of the hashtag, i.e. the
// end offset is exclusive.
type BskyByteSlice struct {
ByteStart int `json:"byteStart"`
ByteEnd int `json:"byteEnd"`
}
// BskyFeature describes what a facet's byte range represents.
type BskyFeature struct {
// Type names the feature kind, e.g. "app.bsky.richtext.facet#tag" for hashtags.
Type string `json:"$type"`
// URI is omitted from JSON when empty; presumably set for link facets - TODO confirm.
URI string `json:"uri,omitempty"`
// Tag is the hashtag without its leading '#'; omitted from JSON when empty.
Tag string `json:"tag,omitempty"` // For hashtag facets
}
// BskyEmbed is the attachment carried by a post.
type BskyEmbed struct {
// Type names the embed kind; PublishItem uses "app.bsky.embed.external".
Type string `json:"$type"`
// External is the link-card payload; omitted from JSON when nil.
External *BskyExternal `json:"external,omitempty"`
// Images is omitted from JSON when empty; it is not populated by the code in this file.
Images []BskyImage `json:"images,omitempty"`
}
// BskyExternal is the link-card payload of an external embed.
// PublishItem fills it from the item's primary URL, its title, and its
// HTML-stripped description truncated to 300 bytes.
type BskyExternal struct {
URI string `json:"uri"`
Title string `json:"title"`
Description string `json:"description"`
// Thumb is an optional uploaded thumbnail blob (BlobRef is declared elsewhere); omitted from JSON when nil.
Thumb *BlobRef `json:"thumb,omitempty"`
}
// BskyImage is one entry of BskyEmbed.Images.
type BskyImage struct {
// Alt is the image's alternative text.
Alt string `json:"alt"`
// Image references the uploaded blob (BlobRef is declared elsewhere).
Image *BlobRef `json:"image"`
// AspectRatio is optional; omitted from JSON when nil.
AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"`
}
// BskyAspectRatio carries the width and height attached to a BskyImage.
// NOTE(review): units are presumably pixels - confirm against the producer.
type BskyAspectRatio struct {
Width int `json:"width"`
Height int `json:"height"`
}
// NewPublisher returns a Publisher that talks to the PDS at pdsHost.
// Its HTTP client gives up on any request after 30 seconds.
func NewPublisher(pdsHost string) *Publisher {
	client := &http.Client{Timeout: 30 * time.Second}
	return &Publisher{pdsHost: pdsHost, httpClient: client}
}
// tidAlphabet is the base32-sortable alphabet used for TID encoding; the
// index of a character in this string is the 5-bit value it represents.
const tidAlphabet = "234567abcdefghijklmnopqrstuvwxyz"

// GenerateRkey creates a deterministic TID-format rkey from a GUID and a
// timestamp. TIDs are required by the Bluesky relay for indexing - custom
// rkeys don't sync.
//
// The encoded 64-bit value is laid out as:
//   - top bit: always 0
//   - next 53 bits: microseconds since the Unix epoch (pre-epoch times clamp to 0)
//   - low 10 bits: a clock ID derived from the SHA-256 digest of guid
//
// The result is 13 base32-sortable characters, or "" when guid is empty.
func GenerateRkey(guid string, timestamp time.Time) string {
	if guid == "" {
		return ""
	}

	const (
		timestampMask = uint64(1)<<53 - 1
		clockIDMask   = uint64(1)<<10 - 1
	)

	// Negative (pre-1970) timestamps are treated as the epoch itself.
	var micros uint64
	if us := timestamp.UnixMicro(); us > 0 {
		micros = uint64(us) & timestampMask
	}

	// Fold the first eight digest bytes into two accumulators: one for the
	// even-indexed bytes (0, 2, 4, 6) and one for the odd-indexed bytes
	// (1, 3, 5, 7). The even fold supplies the upper 8 bits of the clock ID
	// and the top 2 bits of the odd fold supply the lower 2 bits.
	digest := sha256.Sum256([]byte(guid))
	var even, odd uint64
	for i := 0; i < 8; i += 2 {
		even ^= uint64(digest[i])
		odd ^= uint64(digest[i+1])
	}
	clockID := (even<<2 | odd>>6) & clockIDMask

	value := micros<<10 | clockID

	// Emit 5 bits per character, least-significant group last.
	encoded := make([]byte, 13)
	for pos := len(encoded) - 1; pos >= 0; pos-- {
		encoded[pos] = tidAlphabet[value&0x1f]
		value >>= 5
	}
	return string(encoded)
}
// extractURLsPattern matches http:// and https:// URLs up to the first
// whitespace, angle bracket, quote, or closing parenthesis. It is compiled
// once at package scope instead of on every extractURLs call.
var extractURLsPattern = regexp.MustCompile(`https?://[^\s<>"'\)]+`)

// extractURLs finds all http(s) URLs in text, in order of appearance.
//
// Trailing '.', ',', ';', ':', '!' and '?' are stripped from each match
// because they are far more likely to be sentence punctuation than part of
// the URL. Duplicates are kept. The result is nil when text contains no URL.
func extractURLs(text string) []string {
	matches := extractURLsPattern.FindAllString(text, -1)
	if len(matches) == 0 {
		return nil
	}

	urls := make([]string, 0, len(matches))
	for _, m := range matches {
		// A match always retains at least its "http://" prefix after
		// trimming ('/' is not in the cutset), so it can never become empty.
		urls = append(urls, strings.TrimRight(m, ".,;:!?"))
	}
	return urls
}
// toCamelCaseTag turns a free-form tag into a camelCase hashtag body (without
// the leading '#').
//
// Surrounding whitespace and one leading '#' are dropped, then the tag is
// split on spaces, hyphens and underscores. The first word is lowercased;
// every following word gets an uppercase first letter and a lowercase rest.
//
//	"Lagos News"       -> "lagosNews"
//	"AI"               -> "ai"
//	"machine learning" -> "machineLearning"
//
// An empty string is returned when nothing but separators remains.
func toCamelCaseTag(tag string) string {
	cleaned := strings.TrimPrefix(strings.TrimSpace(tag), "#")

	isSeparator := func(r rune) bool {
		switch r {
		case ' ', '-', '_':
			return true
		}
		return false
	}
	parts := strings.FieldsFunc(cleaned, isSeparator)
	if len(parts) == 0 {
		return ""
	}

	var out strings.Builder
	out.WriteString(strings.ToLower(parts[0]))
	for _, part := range parts[1:] {
		// FieldsFunc never yields empty fields, so letters[0] always exists.
		letters := []rune(part)
		out.WriteString(strings.ToUpper(string(letters[0])))
		out.WriteString(strings.ToLower(string(letters[1:])))
	}
	return out.String()
}
// formatTagsForPost converts item tags into a single line of hashtags plus
// the facets that mark each hashtag inside the post text.
//
// Each tag is normalized with toCamelCaseTag; tags that normalize to "" or
// that repeat an earlier tag (compared case-insensitively) are skipped, and
// at most the first five survivors are kept to keep the post compact.
//
// textOffset is the byte offset at which the returned line will sit in the
// final post text; facet ranges are expressed relative to that text, in
// bytes, covering the '#' and the tag. The result looks like
// "#ai #machineLearning #news". With no usable tags it returns "" and nil.
func formatTagsForPost(tags []string, textOffset int) (string, []BskyFacet) {
	const maxTags = 5

	// Normalize, then dedupe while preserving first-seen order.
	var unique []string
	taken := make(map[string]bool)
	for _, raw := range tags {
		name := toCamelCaseTag(raw)
		if name == "" {
			continue
		}
		key := strings.ToLower(name)
		if taken[key] {
			continue
		}
		taken[key] = true
		unique = append(unique, name)
	}
	if len(unique) == 0 {
		return "", nil
	}
	if len(unique) > maxTags {
		unique = unique[:maxTags]
	}

	var text strings.Builder
	facets := make([]BskyFacet, 0, len(unique))
	for n, name := range unique {
		if n > 0 {
			text.WriteString(" ")
		}
		// The bytes written so far give this hashtag's position in the line.
		start := textOffset + text.Len()
		label := "#" + name
		text.WriteString(label)

		facets = append(facets, BskyFacet{
			Index: BskyByteSlice{
				ByteStart: start,
				ByteEnd:   start + len(label),
			},
			Features: []BskyFeature{{
				Type: "app.bsky.richtext.facet#tag",
				Tag:  name,
			}},
		})
	}
	return text.String(), facets
}
// PublishItem posts a feed item to the PDS as an app.bsky.feed.post record
// and returns the AT URI of the created record.
//
// The post text consists only of the item's hashtags (if any); the article
// itself is attached as an external embed (link card) built from the item's
// primary URL, title, description and, when available, a thumbnail uploaded
// from the first image URL.
//
// The record key is deterministic: it is derived from the item's GUID (or
// its link when the GUID is empty) and its PubDate (or DiscoveredAt when
// PubDate is zero).
//
// An error is returned when session or item is nil, when the item has
// neither GUID nor link, when the request cannot be built or sent, when the
// PDS answers with a status other than 200 OK, or when the response cannot
// be read or decoded.
func (p *Publisher) PublishItem(session *PDSSession, item *Item) (string, error) {
	if session == nil {
		return "", fmt.Errorf("no PDS session, cannot publish")
	}
	if item == nil {
		return "", fmt.Errorf("no item, cannot publish")
	}
	if item.GUID == "" && item.Link == "" {
		return "", fmt.Errorf("item has no GUID or link, cannot publish")
	}

	// Collect candidate URLs in priority order: main link, HN comments link,
	// media enclosure. Only the first entry is used below (as the link
	// card's target), so the later candidates matter when the item has no
	// link of its own.
	urlSet := make(map[string]bool)
	var allURLs []string

	if item.Link != "" {
		urlSet[item.Link] = true
		allURLs = append(allURLs, item.Link)
	}

	// For HN feeds, add the comments link from the description
	// (looks like "https://news.ycombinator.com/item?id=...").
	for _, u := range extractURLs(item.Description) {
		if strings.Contains(u, "news.ycombinator.com/item") && !urlSet[u] {
			urlSet[u] = true
			allURLs = append(allURLs, u)
			break // Only add one comments link
		}
	}

	// Add the enclosure URL for podcasts/media (audio/video) if there is
	// room: at most two URLs, and their combined length must stay below 235
	// so that about 60 characters remain for a title within the 300 limit.
	if len(allURLs) < 2 && item.Enclosure != nil && item.Enclosure.URL != "" {
		encType := strings.ToLower(item.Enclosure.Type)
		isMedia := strings.HasPrefix(encType, "audio/") || strings.HasPrefix(encType, "video/")
		if isMedia && !urlSet[item.Enclosure.URL] {
			currentURLLen := 0
			for _, u := range allURLs {
				currentURLLen += len(u) + 2 // +2 for \n\n
			}
			enclosureLen := len(item.Enclosure.URL) + 2
			if currentURLLen+enclosureLen < 235 {
				urlSet[item.Enclosure.URL] = true
				allURLs = append(allURLs, item.Enclosure.URL)
			}
		}
	}

	// The primary URL is the first candidate collected above.
	primaryURL := ""
	if len(allURLs) > 0 {
		primaryURL = allURLs[0]
	}

	// Use the original publication date if available, otherwise current time.
	createdAt := time.Now()
	if !item.PubDate.IsZero() {
		createdAt = item.PubDate
	}

	// The post text carries only hashtags; the link card shows the title,
	// description and thumbnail, and clicking the card doesn't trigger the
	// "leaving Bluesky" warning.
	postText := ""
	var facets []BskyFacet
	if len(item.Tags) > 0 {
		postText, facets = formatTagsForPost(item.Tags, 0)
	}

	post := BskyPost{
		Type:      "app.bsky.feed.post",
		Text:      postText,
		CreatedAt: createdAt.Format(time.RFC3339),
		Facets:    facets,
	}

	if primaryURL != "" {
		external := &BskyExternal{
			URI:         primaryURL,
			Title:       item.Title,
			Description: truncate(stripHTML(item.Description), 300),
		}
		// Thumbnail is best-effort: a nil result simply leaves the card
		// without an image.
		if len(item.ImageURLs) > 0 {
			if thumb := p.fetchAndUploadImage(session, item.ImageURLs[0]); thumb != nil {
				external.Thumb = thumb
			}
		}
		post.Embed = &BskyEmbed{
			Type:     "app.bsky.embed.external",
			External: external,
		}
	}

	// Deterministic rkey: GUID (or link) plus PubDate, so that rkey order
	// matches createdAt order; DiscoveredAt is the fallback when PubDate is
	// unset. Note that createdAt falls back to time.Now() instead.
	guidForRkey := item.GUID
	if guidForRkey == "" {
		guidForRkey = item.Link
	}
	rkeyTime := item.PubDate
	if rkeyTime.IsZero() {
		rkeyTime = item.DiscoveredAt
	}
	rkey := GenerateRkey(guidForRkey, rkeyTime)

	payload := map[string]interface{}{
		"repo":       session.DID,
		"collection": "app.bsky.feed.post",
		"rkey":       rkey,
		"record":     post,
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return "", fmt.Errorf("encoding create record payload: %w", err)
	}

	req, err := http.NewRequest("POST", p.pdsHost+"/xrpc/com.atproto.repo.createRecord", bytes.NewReader(body))
	if err != nil {
		return "", fmt.Errorf("building create record request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+session.AccessJwt)

	resp, err := p.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("sending create record request: %w", err)
	}
	defer resp.Body.Close()

	respBody, readErr := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		// The body is only diagnostic here, so a partial read is still
		// worth reporting alongside the status.
		return "", fmt.Errorf("create record failed: %s - %s", resp.Status, string(respBody))
	}
	if readErr != nil {
		return "", fmt.Errorf("reading create record response: %w", readErr)
	}

	var result struct {
		URI string `json:"uri"`
		CID string `json:"cid"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return "", fmt.Errorf("decoding create record response: %w", err)
	}
	return result.URI, nil
}
// truncate shortens s to at most maxLen bytes.
//
// Strings that already fit are returned unchanged. Longer strings are cut
// and suffixed with "..." so that the result, ellipsis included, does not
// exceed maxLen bytes. The cut always falls on a UTF-8 character boundary,
// so a multi-byte character is dropped whole rather than split (the result
// may therefore be slightly shorter than maxLen).
//
// When maxLen is too small to hold the ellipsis (less than 3), s is cut to
// maxLen bytes without a suffix; a non-positive maxLen yields "".
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	if maxLen <= 0 {
		return ""
	}

	const ellipsis = "..."
	budget := maxLen - len(ellipsis)
	suffix := ellipsis
	if budget < 0 {
		budget = maxLen
		suffix = ""
	}

	// Ranging over a string yields the byte index of each character start;
	// keep the last one that does not exceed the budget.
	cut := 0
	for i := range s {
		if i > budget {
			break
		}
		cut = i
	}
	return s[:cut] + suffix
}
// Helpers for stripHTML, built once at package scope rather than per call.
var (
	// stripHTMLTagPattern matches anything that looks like an HTML tag.
	stripHTMLTagPattern = regexp.MustCompile(`<[^>]*>`)
	// stripHTMLSpacePattern matches a run of whitespace.
	stripHTMLSpacePattern = regexp.MustCompile(`\s+`)
	// stripHTMLEntities decodes the common entities in a single pass, so the
	// output of one replacement is never re-scanned by another (for example
	// "&amp;lt;" decodes to "&lt;", not "<").
	stripHTMLEntities = strings.NewReplacer(
		"&amp;", "&",
		"&lt;", "<",
		"&gt;", ">",
		"&quot;", "\"",
		"&#39;", "'",
		"&nbsp;", " ",
	)
)

// stripHTML converts an HTML fragment to plain text.
//
// It removes everything that looks like a tag, decodes the entities &amp;,
// &lt;, &gt;, &quot;, &#39; and &nbsp; (each exactly once), collapses every
// run of whitespace to a single space, and trims the result. Other entities
// are left as written.
func stripHTML(s string) string {
	s = stripHTMLTagPattern.ReplaceAllString(s, "")
	s = stripHTMLEntities.Replace(s)
	s = stripHTMLSpacePattern.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}