From a1f02cd0bc35acd6857d78341968eaa8b4be4fc7 Mon Sep 17 00:00:00 2001 From: primal Date: Wed, 28 Jan 2026 21:24:35 -0500 Subject: [PATCH] Fix image embeds and rkey collisions - Add image_urls to GetAllUnpublishedItems query - Add aspectRatio to image embeds (required by Bluesky) - Add image decoding to get dimensions (width/height) - Fix rkey collision by using XOR of multiple hash bytes The rkey collision was attributed to the clock ID using only 2 hash bytes (10 bits), which gives a ~0.1% (1/1024) collision rate per pair of items with the same timestamp. The clock ID is now built by XORing 8 hash bytes, intended to improve entropy distribution; note that it is still masked to 10 bits, so the per-pair collision rate remains 1/1024. Co-Authored-By: Claude Opus 4.5 --- crawler.go | 12 +++++++--- publisher.go | 67 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/crawler.go b/crawler.go index 013e0f4..90776f2 100644 --- a/crawler.go +++ b/crawler.go @@ -1,6 +1,7 @@ package main import ( + "encoding/json" "fmt" "io" "net/http" @@ -371,7 +372,7 @@ func (c *Crawler) RefreshAllProfiles(publisher *Publisher, feedPassword string) func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) { rows, err := c.db.Query(` SELECT i.id, i.feed_url, i.guid, i.title, i.link, i.description, i.content, - i.author, i.pub_date, i.discovered_at + i.author, i.pub_date, i.discovered_at, i.image_urls FROM items i JOIN feeds f ON i.feed_url = f.url WHERE f.publish_status = 'pass' @@ -388,11 +389,11 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) ([]Item, error) { var items []Item for rows.Next() { var item Item - var guid, title, link, description, content, author *string + var guid, title, link, description, content, author, imageURLsJSON *string var pubDate, discoveredAt *time.Time err := rows.Scan(&item.ID, &item.FeedURL, &guid, &title, &link, &description, - &content, &author, &pubDate, &discoveredAt) + &content, &author, &pubDate, &discoveredAt, &imageURLsJSON) if err != nil { continue } @@ -406,6 +407,11 @@ func (c *Crawler) GetAllUnpublishedItems(limit int) 
([]Item, error) { item.PubDate = TimeValue(pubDate) item.DiscoveredAt = TimeValue(discoveredAt) + // Parse image URLs from JSON array + if imageURLsJSON != nil && *imageURLsJSON != "" { + json.Unmarshal([]byte(*imageURLsJSON), &item.ImageURLs) + } + items = append(items, item) } diff --git a/publisher.go b/publisher.go index 595bd76..eea7585 100644 --- a/publisher.go +++ b/publisher.go @@ -5,6 +5,10 @@ import ( "crypto/sha256" "encoding/json" "fmt" + "image" + _ "image/gif" + _ "image/jpeg" + _ "image/png" "io" "net/http" "net/url" @@ -12,6 +16,8 @@ import ( "strings" "time" "unicode/utf8" + + _ "golang.org/x/image/webp" ) // Publisher handles posting items to AT Protocol PDS @@ -66,8 +72,14 @@ type BskyExternal struct { } type BskyImage struct { - Alt string `json:"alt"` - Image *BlobRef `json:"image"` + Alt string `json:"alt"` + Image *BlobRef `json:"image"` + AspectRatio *BskyAspectRatio `json:"aspectRatio,omitempty"` +} + +type BskyAspectRatio struct { + Width int `json:"width"` + Height int `json:"height"` } // NewPublisher creates a new Publisher instance @@ -216,8 +228,12 @@ func GenerateRkey(guid string, timestamp time.Time) string { micros := uint64(microsInt) & ((1 << 53) - 1) // Generate deterministic 10-bit clock ID from GUID hash + // Use XOR of multiple hash bytes to reduce collisions hash := sha256.Sum256([]byte(guid)) - clockID := uint64(hash[0])<<2 | uint64(hash[1])>>6 + // XOR bytes 0-3 together, then 4-7, combine for more entropy + h1 := uint64(hash[0]) ^ uint64(hash[2]) ^ uint64(hash[4]) ^ uint64(hash[6]) + h2 := uint64(hash[1]) ^ uint64(hash[3]) ^ uint64(hash[5]) ^ uint64(hash[7]) + clockID := (h1 << 2) | (h2 >> 6) clockID = clockID & ((1 << 10) - 1) // 10 bits = 0-1023 // Combine: top bit 0, 53 bits timestamp, 10 bits clock ID @@ -460,12 +476,19 @@ func (p *Publisher) uploadImages(session *PDSSession, imageURLs []string, altTex } for i := 0; i < maxImages; i++ { - blob := p.fetchAndUploadImage(session, imageURLs[i]) - if blob != nil { - images 
= append(images, BskyImage{ + result := p.fetchAndUploadImageWithDimensions(session, imageURLs[i]) + if result != nil && result.Blob != nil { + img := BskyImage{ Alt: altText, - Image: blob, - }) + Image: result.Blob, + } + if result.Width > 0 && result.Height > 0 { + img.AspectRatio = &BskyAspectRatio{ + Width: result.Width, + Height: result.Height, + } + } + images = append(images, img) } } @@ -514,7 +537,22 @@ func (p *Publisher) FetchFavicon(siteURL string) string { return fmt.Sprintf("https://www.google.com/s2/favicons?domain=%s&sz=128", u.Host) } +// ImageUploadResult contains the uploaded blob and image dimensions +type ImageUploadResult struct { + Blob *BlobRef + Width int + Height int +} + func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *BlobRef { + result := p.fetchAndUploadImageWithDimensions(session, imageURL) + if result == nil { + return nil + } + return result.Blob +} + +func (p *Publisher) fetchAndUploadImageWithDimensions(session *PDSSession, imageURL string) *ImageUploadResult { // Fetch the image resp, err := p.httpClient.Get(imageURL) if err != nil { @@ -552,13 +590,24 @@ func (p *Publisher) fetchAndUploadImage(session *PDSSession, imageURL string) *B return nil } + // Decode image to get dimensions + img, _, err := image.DecodeConfig(bytes.NewReader(data)) + width, height := 1, 1 // Default if decode fails + if err == nil { + width, height = img.Width, img.Height + } + // Upload to PDS blob, err := p.UploadBlob(session, data, contentType) if err != nil { return nil } - return blob + return &ImageUploadResult{ + Blob: blob, + Width: width, + Height: height, + } } func truncate(s string, maxLen int) string {