Normalize guessed languages for code highlighting (#36450)

Handles the case where Enry correctly recognizes the language but returns
the language name in a form that chroma does not recognize.

Resolves https://github.com/go-gitea/gitea/issues/22443

---------

Co-authored-by: Moritz Jörg <moritz.jorg@oceanbox.io>
Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
This commit is contained in:
Moritz Jörg
2026-01-25 15:02:16 +01:00
committed by GitHub
parent 12a81d38c1
commit 89bfddc5c2
10 changed files with 148 additions and 141 deletions
+3 -2
View File
@@ -2485,8 +2485,9 @@ LEVEL = Info
;[highlight.mapping] ;[highlight.mapping]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Extension mapping to highlight class ;; Extension mapping to highlight class, for example:
;; e.g. .toml=ini ;; .toml = ini
;; .my-js = JavaScript
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+3 -2
View File
@@ -4,12 +4,13 @@
package analyze package analyze
import ( import (
"path/filepath" "path"
"github.com/go-enry/go-enry/v2" "github.com/go-enry/go-enry/v2"
) )
// GetCodeLanguage detects code language based on file name and content // GetCodeLanguage detects code language based on file name and content
// It can be slow when the content is used for detection
func GetCodeLanguage(filename string, content []byte) string { func GetCodeLanguage(filename string, content []byte) string {
if language, ok := enry.GetLanguageByExtension(filename); ok { if language, ok := enry.GetLanguageByExtension(filename); ok {
return language return language
@@ -23,5 +24,5 @@ func GetCodeLanguage(filename string, content []byte) string {
return enry.OtherLanguage return enry.OtherLanguage
} }
return enry.GetLanguage(filepath.Base(filename), content) return enry.GetLanguage(path.Base(filename), content)
} }
+91 -95
View File
@@ -12,7 +12,6 @@ import (
"html/template" "html/template"
"io" "io"
"path" "path"
"path/filepath"
"strings" "strings"
"sync" "sync"
@@ -25,35 +24,32 @@ import (
"github.com/alecthomas/chroma/v2/formatters/html" "github.com/alecthomas/chroma/v2/formatters/html"
"github.com/alecthomas/chroma/v2/lexers" "github.com/alecthomas/chroma/v2/lexers"
"github.com/alecthomas/chroma/v2/styles" "github.com/alecthomas/chroma/v2/styles"
lru "github.com/hashicorp/golang-lru/v2" "github.com/go-enry/go-enry/v2"
) )
// don't index files larger than this many bytes for performance purposes // don't index files larger than this many bytes for performance purposes
const sizeLimit = 1024 * 1024 const sizeLimit = 1024 * 1024
type globalVarsType struct {
highlightMapping map[string]string
githubStyles *chroma.Style
}
var ( var (
// For custom user mapping globalVarsMu sync.Mutex
highlightMapping = map[string]string{} globalVarsPtr *globalVarsType
once sync.Once
cache *lru.TwoQueueCache[string, any]
githubStyles = styles.Get("github")
) )
// NewContext loads custom highlight map from local config func globalVars() *globalVarsType {
func NewContext() { // in the future, the globalVars might need to be re-initialized when settings change, so don't use sync.Once here
once.Do(func() { globalVarsMu.Lock()
highlightMapping = setting.GetHighlightMapping() defer globalVarsMu.Unlock()
if globalVarsPtr == nil {
// The size 512 is simply a conservative rule of thumb globalVarsPtr = &globalVarsType{}
c, err := lru.New2Q[string, any](512) globalVarsPtr.githubStyles = styles.Get("github")
if err != nil { globalVarsPtr.highlightMapping = setting.GetHighlightMapping()
panic(fmt.Sprintf("failed to initialize LRU cache for highlighter: %s", err)) }
} return globalVarsPtr
cache = c
})
} }
// UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags // UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags
@@ -88,10 +84,56 @@ func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) {
} }
} }
// Code returns an HTML version of code string with chroma syntax highlighting classes and the matched lexer name func getChromaLexerByLanguage(fileName, lang string) chroma.Lexer {
func Code(fileName, language, code string) (output template.HTML, lexerName string) { lang, _, _ = strings.Cut(lang, "?") // maybe, the value from gitattributes might contain `?` parameters?
NewContext() ext := path.Ext(fileName)
// the "lang" might come from enry, it has different naming for some languages
switch lang {
case "F#":
lang = "FSharp"
case "Pascal":
lang = "ObjectPascal"
case "C":
if ext == ".C" || ext == ".H" {
lang = "C++"
}
}
// lexers.Get is slow if the language name can't be matched directly: it does extra "Match" call to iterate all lexers
return lexers.Get(lang)
}
// GetChromaLexerWithFallback returns a chroma lexer by given file name, language and code content. All parameters can be optional.
// When code content is provided, it will be slow if no lexer is found by file name or language.
// If no lexer is found, it will return the fallback lexer.
func GetChromaLexerWithFallback(fileName, lang string, code []byte) (lexer chroma.Lexer) {
if lang != "" {
lexer = getChromaLexerByLanguage(fileName, lang)
}
if lexer == nil {
fileExt := path.Ext(fileName)
if val, ok := globalVars().highlightMapping[fileExt]; ok {
lexer = getChromaLexerByLanguage(fileName, val) // use mapped value to find lexer
}
}
if lexer == nil {
// when using "code" to detect, analyze.GetCodeLanguage is slower, it iterates many rules to detect language from content
// this is the old logic: use enry to detect language, and use chroma to render, but their naming is different for some languages
enryLanguage := analyze.GetCodeLanguage(fileName, code)
lexer = getChromaLexerByLanguage(fileName, enryLanguage)
if lexer == nil {
if enryLanguage != enry.OtherLanguage {
log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", enryLanguage, fileName)
}
lexer = lexers.Match(fileName) // lexers.Match will search by its basename and extname
}
}
return util.IfZero(lexer, lexers.Fallback)
}
func renderCode(fileName, language, code string, slowGuess bool) (output template.HTML, lexerName string) {
// diff view newline will be passed as empty, change to literal '\n' so it can be copied // diff view newline will be passed as empty, change to literal '\n' so it can be copied
// preserve literal newline in blame view // preserve literal newline in blame view
if code == "" || code == "\n" { if code == "" || code == "\n" {
@@ -102,45 +144,25 @@ func Code(fileName, language, code string) (output template.HTML, lexerName stri
return template.HTML(template.HTMLEscapeString(code)), "" return template.HTML(template.HTMLEscapeString(code)), ""
} }
var lexer chroma.Lexer var codeForGuessLexer []byte
if slowGuess {
if len(language) > 0 { // it is slower to guess lexer by code content, so only do it when necessary
lexer = lexers.Get(language) codeForGuessLexer = util.UnsafeStringToBytes(code)
if lexer == nil {
// Attempt stripping off the '?'
if before, _, ok := strings.Cut(language, "?"); ok {
lexer = lexers.Get(before)
}
}
} }
lexer := GetChromaLexerWithFallback(fileName, language, codeForGuessLexer)
if lexer == nil { return RenderCodeByLexer(lexer, code), formatLexerName(lexer.Config().Name)
if val, ok := highlightMapping[path.Ext(fileName)]; ok {
// use mapped value to find lexer
lexer = lexers.Get(val)
}
}
if lexer == nil {
if l, ok := cache.Get(fileName); ok {
lexer = l.(chroma.Lexer)
}
}
if lexer == nil {
lexer = lexers.Match(fileName)
if lexer == nil {
lexer = lexers.Fallback
}
cache.Add(fileName, lexer)
}
return CodeFromLexer(lexer, code), formatLexerName(lexer.Config().Name)
} }
// CodeFromLexer returns a HTML version of code string with chroma syntax highlighting classes func RenderCodeFast(fileName, language, code string) (output template.HTML, lexerName string) {
func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML { return renderCode(fileName, language, code, false)
}
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexerName string) {
return renderCode(fileName, language, code, true)
}
// RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes
func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML {
formatter := html.New(html.WithClasses(true), formatter := html.New(html.WithClasses(true),
html.WithLineNumbers(false), html.WithLineNumbers(false),
html.PreventSurroundingPre(true), html.PreventSurroundingPre(true),
@@ -155,7 +177,7 @@ func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
return template.HTML(template.HTMLEscapeString(code)) return template.HTML(template.HTMLEscapeString(code))
} }
// style not used for live site but need to pass something // style not used for live site but need to pass something
err = formatter.Format(htmlw, githubStyles, iterator) err = formatter.Format(htmlw, globalVars().githubStyles, iterator)
if err != nil { if err != nil {
log.Error("Can't format code: %v", err) log.Error("Can't format code: %v", err)
return template.HTML(template.HTMLEscapeString(code)) return template.HTML(template.HTMLEscapeString(code))
@@ -167,12 +189,10 @@ func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n")) return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
} }
// File returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name // RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
func File(fileName, language string, code []byte) ([]template.HTML, string, error) { func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) {
NewContext()
if len(code) > sizeLimit { if len(code) > sizeLimit {
return PlainText(code), "", nil return RenderPlainText(code), "", nil
} }
formatter := html.New(html.WithClasses(true), formatter := html.New(html.WithClasses(true),
@@ -180,31 +200,7 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro
html.PreventSurroundingPre(true), html.PreventSurroundingPre(true),
) )
var lexer chroma.Lexer lexer := GetChromaLexerWithFallback(fileName, language, code)
// provided language overrides everything
if language != "" {
lexer = lexers.Get(language)
}
if lexer == nil {
if val, ok := highlightMapping[filepath.Ext(fileName)]; ok {
lexer = lexers.Get(val)
}
}
if lexer == nil {
guessLanguage := analyze.GetCodeLanguage(fileName, code)
lexer = lexers.Get(guessLanguage)
if lexer == nil {
lexer = lexers.Match(fileName)
if lexer == nil {
lexer = lexers.Fallback
}
}
}
lexerName := formatLexerName(lexer.Config().Name) lexerName := formatLexerName(lexer.Config().Name)
iterator, err := lexer.Tokenise(nil, string(code)) iterator, err := lexer.Tokenise(nil, string(code))
@@ -218,7 +214,7 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro
lines := make([]template.HTML, 0, len(tokensLines)) lines := make([]template.HTML, 0, len(tokensLines))
for _, tokens := range tokensLines { for _, tokens := range tokensLines {
iterator = chroma.Literator(tokens...) iterator = chroma.Literator(tokens...)
err = formatter.Format(htmlBuf, githubStyles, iterator) err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
if err != nil { if err != nil {
return nil, "", fmt.Errorf("can't format code: %w", err) return nil, "", fmt.Errorf("can't format code: %w", err)
} }
@@ -229,8 +225,8 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro
return lines, lexerName, nil return lines, lexerName, nil
} }
// PlainText returns non-highlighted HTML for code // RenderPlainText returns non-highlighted HTML for code
func PlainText(code []byte) []template.HTML { func RenderPlainText(code []byte) []template.HTML {
r := bufio.NewReader(bytes.NewReader(code)) r := bufio.NewReader(bytes.NewReader(code))
m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1) m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
for { for {
+35 -2
View File
@@ -112,7 +112,7 @@ c=2
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
out, lexerName, err := File(tt.name, "", []byte(tt.code)) out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code))
assert.NoError(t, err) assert.NoError(t, err)
assert.Equal(t, tt.want, out) assert.Equal(t, tt.want, out)
assert.Equal(t, tt.lexerName, lexerName) assert.Equal(t, tt.lexerName, lexerName)
@@ -176,7 +176,7 @@ c=2`),
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
out := PlainText([]byte(tt.code)) out := RenderPlainText([]byte(tt.code))
assert.Equal(t, tt.want, out) assert.Equal(t, tt.want, out)
}) })
} }
@@ -199,3 +199,36 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
assert.Equal(t, "<span>a</span>\n", string(ret[0])) assert.Equal(t, "<span>a</span>\n", string(ret[0]))
assert.Equal(t, "<span>b\n</span>", string(ret[1])) assert.Equal(t, "<span>b\n</span>", string(ret[1]))
} }
func TestGetChromaLexer(t *testing.T) {
globalVars().highlightMapping[".my-html"] = "HTML"
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
cases := []struct {
fileName string
language string
content string
expected string
}{
{"test.py", "", "", "Python"},
{"any-file", "javascript", "", "JavaScript"},
{"any-file", "", "/* vim: set filetype=python */", "Python"},
{"any-file", "", "", "fallback"},
{"test.fs", "", "", "Forth"},
{"test.fs", "F#", "", "FSharp"},
{"test.fs", "", "let x = 1", "FSharp"},
{"test.c", "", "", "C"},
{"test.C", "", "", "C++"},
{"OLD-CODE.PAS", "", "", "ObjectPascal"},
{"test.my-html", "", "", "HTML"},
}
for _, c := range cases {
lexer := GetChromaLexerWithFallback(c.fileName, c.language, []byte(c.content))
if assert.NotNil(t, lexer, "case: %+v", c) {
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
}
}
}
+2 -2
View File
@@ -72,10 +72,10 @@ func writeStrings(buf *bytes.Buffer, strs ...string) error {
func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine { func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine {
// we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting // we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
hl, _ := highlight.Code(filename, language, code) hl, _ := highlight.RenderCodeFast(filename, language, code)
highlightedLines := strings.Split(string(hl), "\n") highlightedLines := strings.Split(string(hl), "\n")
// The lineNums outputted by highlight.Code might not match the original lineNums, because "highlight" removes the last `\n` // The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n`
lines := make([]*ResultLine, min(len(highlightedLines), len(lineNums))) lines := make([]*ResultLine, min(len(highlightedLines), len(lineNums)))
for i := range lines { for i := range lines {
lines[i] = &ResultLine{ lines[i] = &ResultLine{
+8 -30
View File
@@ -5,7 +5,6 @@ package orgmode
import ( import (
"fmt" "fmt"
"html"
"html/template" "html/template"
"io" "io"
"strings" "strings"
@@ -17,7 +16,6 @@ import (
"code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/setting"
"github.com/alecthomas/chroma/v2" "github.com/alecthomas/chroma/v2"
"github.com/alecthomas/chroma/v2/lexers"
"github.com/niklasfasching/go-org/org" "github.com/niklasfasching/go-org/org"
) )
@@ -57,40 +55,20 @@ func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error
htmlWriter.HighlightCodeBlock = func(source, lang string, inline bool, params map[string]string) string { htmlWriter.HighlightCodeBlock = func(source, lang string, inline bool, params map[string]string) string {
defer func() { defer func() {
if err := recover(); err != nil { if err := recover(); err != nil {
// catch the panic, log the error and return empty result
log.Error("Panic in HighlightCodeBlock: %v\n%s", err, log.Stack(2)) log.Error("Panic in HighlightCodeBlock: %v\n%s", err, log.Stack(2))
panic(err)
} }
}() }()
w := &strings.Builder{}
lexer := lexers.Get(lang) lexer := highlight.GetChromaLexerWithFallback("", lang, nil) // don't use content to detect, it is too slow
if lexer == nil && lang == "" { lexer = chroma.Coalesce(lexer)
lexer = lexers.Analyse(source)
if lexer == nil {
lexer = lexers.Fallback
}
lang = strings.ToLower(lexer.Config().Name)
}
sb := &strings.Builder{}
// include language-x class as part of commonmark spec // include language-x class as part of commonmark spec
if err := ctx.RenderInternal.FormatWithSafeAttrs(w, `<pre><code class="chroma language-%s">`, lang); err != nil { _ = ctx.RenderInternal.FormatWithSafeAttrs(sb, `<pre><code class="chroma language-%s">`, strings.ToLower(lexer.Config().Name))
return "" _, _ = sb.WriteString(string(highlight.RenderCodeByLexer(lexer, source)))
} _, _ = sb.WriteString("</code></pre>")
if lexer == nil { return sb.String()
if _, err := w.WriteString(html.EscapeString(source)); err != nil {
return ""
}
} else {
lexer = chroma.Coalesce(lexer)
if _, err := w.WriteString(string(highlight.CodeFromLexer(lexer, source))); err != nil {
return ""
}
}
if _, err := w.WriteString("</code></pre>"); err != nil {
return ""
}
return w.String()
} }
w := &orgWriter{rctx: ctx, HTMLWriter: htmlWriter} w := &orgWriter{rctx: ctx, HTMLWriter: htmlWriter}
-2
View File
@@ -15,7 +15,6 @@ import (
"code.gitea.io/gitea/modules/eventsource" "code.gitea.io/gitea/modules/eventsource"
"code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/git/gitcmd" "code.gitea.io/gitea/modules/git/gitcmd"
"code.gitea.io/gitea/modules/highlight"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/markup" "code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/markup/external" "code.gitea.io/gitea/modules/markup/external"
@@ -131,7 +130,6 @@ func InitWebInstalled(ctx context.Context) {
mustInit(uinotification.Init) mustInit(uinotification.Init)
mustInitCtx(ctx, archiver.Init) mustInitCtx(ctx, archiver.Init)
highlight.NewContext()
external.RegisterRenderers() external.RegisterRenderers()
markup.Init(markup_service.FormalRenderHelperFuncs()) markup.Init(markup_service.FormalRenderHelperFuncs())
+1 -1
View File
@@ -267,7 +267,7 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa
bufContent := buf.Bytes() bufContent := buf.Bytes()
bufContent = charset.ToUTF8(bufContent, charset.ConvertOpts{}) bufContent = charset.ToUTF8(bufContent, charset.ConvertOpts{})
highlighted, lexerName := highlight.Code(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent)) highlighted, lexerName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent))
unsafeLines := highlight.UnsafeSplitHighlightedLines(highlighted) unsafeLines := highlight.UnsafeSplitHighlightedLines(highlighted)
for i, br := range rows { for i, br := range rows {
var line template.HTML var line template.HTML
+3 -3
View File
@@ -124,11 +124,11 @@ func handleFileViewRenderSource(ctx *context.Context, filename string, attrs *at
} }
language := attrs.GetLanguage().Value() language := attrs.GetLanguage().Value()
fileContent, lexerName, err := highlight.File(filename, language, buf) fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf)
ctx.Data["LexerName"] = lexerName ctx.Data["LexerName"] = lexerName
if err != nil { if err != nil {
log.Error("highlight.File failed, fallback to plain text: %v", err) log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err)
fileContent = highlight.PlainText(buf) fileContent = highlight.RenderPlainText(buf)
} }
status := &charset.EscapeStatus{} status := &charset.EscapeStatus{}
statuses := make([]*charset.EscapeStatus, len(fileContent)) statuses := make([]*charset.EscapeStatus, len(fileContent))
+2 -2
View File
@@ -331,7 +331,7 @@ func (diffSection *DiffSection) getLineContentForRender(lineIdx int, diffLine *D
if setting.Git.DisableDiffHighlight { if setting.Git.DisableDiffHighlight {
return template.HTML(html.EscapeString(diffLine.Content[1:])) return template.HTML(html.EscapeString(diffLine.Content[1:]))
} }
h, _ = highlight.Code(diffSection.FileName, fileLanguage, diffLine.Content[1:]) h, _ = highlight.RenderCodeFast(diffSection.FileName, fileLanguage, diffLine.Content[1:])
return h return h
} }
@@ -1349,7 +1349,7 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit
func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML {
content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{})) content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{}))
highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) highlightedNewContent, _ := highlight.RenderCodeFast(diffFile.Name, diffFile.Language, content)
unsafeLines := highlight.UnsafeSplitHighlightedLines(highlightedNewContent) unsafeLines := highlight.UnsafeSplitHighlightedLines(highlightedNewContent)
lines := make(map[int]template.HTML, len(unsafeLines)) lines := make(map[int]template.HTML, len(unsafeLines))
// only save the highlighted lines we need, but not the whole file, to save memory // only save the highlighted lines we need, but not the whole file, to save memory