Normalize guessed languages for code highlighting (#36450)

Handles the case where Enry correctly recognizes the language but returns
the language name in a form that chroma does not recognize.

Resolves https://github.com/go-gitea/gitea/issues/22443

---------

Co-authored-by: Moritz Jörg <moritz.jorg@oceanbox.io>
Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
This commit is contained in:
Moritz Jörg
2026-01-25 15:02:16 +01:00
committed by GitHub
parent 12a81d38c1
commit 89bfddc5c2
10 changed files with 148 additions and 141 deletions
+3 -2
View File
@@ -2485,8 +2485,9 @@ LEVEL = Info
;[highlight.mapping]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Extension mapping to highlight class
;; e.g. .toml=ini
;; Extension mapping to highlight class, for example:
;; .toml = ini
;; .my-js = JavaScript
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+3 -2
View File
@@ -4,12 +4,13 @@
package analyze
import (
"path/filepath"
"path"
"github.com/go-enry/go-enry/v2"
)
// GetCodeLanguage detects code language based on file name and content
// It can be slow when the content is used for detection
func GetCodeLanguage(filename string, content []byte) string {
if language, ok := enry.GetLanguageByExtension(filename); ok {
return language
@@ -23,5 +24,5 @@ func GetCodeLanguage(filename string, content []byte) string {
return enry.OtherLanguage
}
return enry.GetLanguage(filepath.Base(filename), content)
return enry.GetLanguage(path.Base(filename), content)
}
+91 -95
View File
@@ -12,7 +12,6 @@ import (
"html/template"
"io"
"path"
"path/filepath"
"strings"
"sync"
@@ -25,35 +24,32 @@ import (
"github.com/alecthomas/chroma/v2/formatters/html"
"github.com/alecthomas/chroma/v2/lexers"
"github.com/alecthomas/chroma/v2/styles"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/go-enry/go-enry/v2"
)
// sizeLimit is the maximum size in bytes of code that gets syntax highlighting;
// larger content is rendered as plain text for performance purposes
const sizeLimit = 1024 * 1024
// globalVarsType groups the package-level state used by the highlighter.
// The single shared instance is obtained through globalVars().
type globalVarsType struct {
// highlightMapping is the custom user mapping loaded from setting.GetHighlightMapping();
// it is looked up by file extension (as returned by path.Ext) and yields a language name
highlightMapping map[string]string
// githubStyles is the chroma "github" style handed to the HTML formatter
githubStyles *chroma.Style
}
var (
// For custom user mapping
highlightMapping = map[string]string{}
once sync.Once
cache *lru.TwoQueueCache[string, any]
githubStyles = styles.Get("github")
globalVarsMu sync.Mutex
globalVarsPtr *globalVarsType
)
// NewContext loads custom highlight map from local config
func NewContext() {
once.Do(func() {
highlightMapping = setting.GetHighlightMapping()
// The size 512 is simply a conservative rule of thumb
c, err := lru.New2Q[string, any](512)
if err != nil {
panic(fmt.Sprintf("failed to initialize LRU cache for highlighter: %s", err))
}
cache = c
})
// globalVars returns the shared highlighter state, creating it on the first call.
// The "github" chroma style and the custom highlight mapping from the settings
// are loaded during that first call.
func globalVars() *globalVarsType {
	// a mutex (not sync.Once) guards the initialization, because in the future
	// the state might need to be re-initialized when settings change
	globalVarsMu.Lock()
	defer globalVarsMu.Unlock()

	if globalVarsPtr != nil {
		return globalVarsPtr
	}

	vars := &globalVarsType{}
	globalVarsPtr = vars
	vars.githubStyles = styles.Get("github")
	vars.highlightMapping = setting.GetHighlightMapping()
	return globalVarsPtr
}
// UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags
@@ -88,10 +84,56 @@ func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) {
}
}
// Code returns an HTML version of code string with chroma syntax highlighting classes and the matched lexer name
func Code(fileName, language, code string) (output template.HTML, lexerName string) {
NewContext()
// getChromaLexerByLanguage resolves a chroma lexer from a language name.
// The name may come from enry, which names some languages differently from chroma,
// so the known differences are translated before the lookup.
// fileName is only used to inspect the extension for the ambiguous "C" case.
// The result of lexers.Get is returned as-is.
func getChromaLexerByLanguage(fileName, lang string) chroma.Lexer {
	// the value from gitattributes might contain `?` parameters, keep only the part before it
	if name, _, hasParams := strings.Cut(lang, "?"); hasParams {
		lang = name
	}

	// NOTE: lexers.Get is slow if the language name can't be matched directly:
	// it does an extra "Match" call which iterates all lexers
	switch lang {
	case "F#":
		return lexers.Get("FSharp")
	case "Pascal":
		return lexers.Get("ObjectPascal")
	case "C":
		// upper-case ".C" / ".H" extensions are treated as C++
		switch path.Ext(fileName) {
		case ".C", ".H":
			return lexers.Get("C++")
		}
	}
	return lexers.Get(lang)
}
// GetChromaLexerWithFallback picks a chroma lexer from the file name, the language and the code content.
// Every parameter is optional. The lookup order is:
//  1. the given language
//  2. the custom highlight mapping for the file extension
//  3. the language detected by enry from the file name and content
//  4. chroma's own file name matching
//
// If none of them yields a lexer, the fallback lexer is returned.
// Providing code content makes the call slow when neither the language nor the file name identifies a lexer.
func GetChromaLexerWithFallback(fileName, lang string, code []byte) (lexer chroma.Lexer) {
	if lang != "" {
		lexer = getChromaLexerByLanguage(fileName, lang)
	}

	if lexer == nil {
		// consult the custom mapping, the mapped value is used as the language name
		if mappedLang, found := globalVars().highlightMapping[path.Ext(fileName)]; found {
			lexer = getChromaLexerByLanguage(fileName, mappedLang)
		}
	}

	if lexer == nil {
		// This is the old logic: enry detects the language and chroma renders it,
		// but the two libraries name some languages differently.
		// With "code" provided, analyze.GetCodeLanguage is slower because it iterates many content-based rules.
		detected := analyze.GetCodeLanguage(fileName, code)
		lexer = getChromaLexerByLanguage(fileName, detected)
	again:
		if lexer == nil {
			if detected != enry.OtherLanguage {
				log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", detected, fileName)
				detected = enry.OtherLanguage // already reported
				goto again
			}
			// lexers.Match searches by the base name and the extension name
			lexer = lexers.Match(fileName)
		}
	}

	return util.IfZero(lexer, lexers.Fallback)
}
func renderCode(fileName, language, code string, slowGuess bool) (output template.HTML, lexerName string) {
// diff view newline will be passed as empty, change to literal '\n' so it can be copied
// preserve literal newline in blame view
if code == "" || code == "\n" {
@@ -102,45 +144,25 @@ func Code(fileName, language, code string) (output template.HTML, lexerName stri
return template.HTML(template.HTMLEscapeString(code)), ""
}
var lexer chroma.Lexer
if len(language) > 0 {
lexer = lexers.Get(language)
if lexer == nil {
// Attempt stripping off the '?'
if before, _, ok := strings.Cut(language, "?"); ok {
lexer = lexers.Get(before)
}
}
var codeForGuessLexer []byte
if slowGuess {
// it is slower to guess lexer by code content, so only do it when necessary
codeForGuessLexer = util.UnsafeStringToBytes(code)
}
if lexer == nil {
if val, ok := highlightMapping[path.Ext(fileName)]; ok {
// use mapped value to find lexer
lexer = lexers.Get(val)
}
}
if lexer == nil {
if l, ok := cache.Get(fileName); ok {
lexer = l.(chroma.Lexer)
}
}
if lexer == nil {
lexer = lexers.Match(fileName)
if lexer == nil {
lexer = lexers.Fallback
}
cache.Add(fileName, lexer)
}
return CodeFromLexer(lexer, code), formatLexerName(lexer.Config().Name)
lexer := GetChromaLexerWithFallback(fileName, language, codeForGuessLexer)
return RenderCodeByLexer(lexer, code), formatLexerName(lexer.Config().Name)
}
// CodeFromLexer returns a HTML version of code string with chroma syntax highlighting classes
func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
// RenderCodeFast renders code via renderCode with slow guessing disabled:
// the code content is not handed to the lexer detection.
// It returns the rendered HTML and the lexer name reported by renderCode.
func RenderCodeFast(fileName, language, code string) (output template.HTML, lexerName string) {
	const slowGuess = false
	output, lexerName = renderCode(fileName, language, code, slowGuess)
	return output, lexerName
}
// RenderCodeSlowGuess renders code via renderCode with slow guessing enabled:
// the code content is also handed to the lexer detection, which is slower.
// It returns the rendered HTML and the lexer name reported by renderCode.
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexerName string) {
	const slowGuess = true
	output, lexerName = renderCode(fileName, language, code, slowGuess)
	return output, lexerName
}
// RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes
func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML {
formatter := html.New(html.WithClasses(true),
html.WithLineNumbers(false),
html.PreventSurroundingPre(true),
@@ -155,7 +177,7 @@ func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
return template.HTML(template.HTMLEscapeString(code))
}
// style not used for live site but need to pass something
err = formatter.Format(htmlw, githubStyles, iterator)
err = formatter.Format(htmlw, globalVars().githubStyles, iterator)
if err != nil {
log.Error("Can't format code: %v", err)
return template.HTML(template.HTMLEscapeString(code))
@@ -167,12 +189,10 @@ func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
}
// File returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
func File(fileName, language string, code []byte) ([]template.HTML, string, error) {
NewContext()
// RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) {
if len(code) > sizeLimit {
return PlainText(code), "", nil
return RenderPlainText(code), "", nil
}
formatter := html.New(html.WithClasses(true),
@@ -180,31 +200,7 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro
html.PreventSurroundingPre(true),
)
var lexer chroma.Lexer
// provided language overrides everything
if language != "" {
lexer = lexers.Get(language)
}
if lexer == nil {
if val, ok := highlightMapping[filepath.Ext(fileName)]; ok {
lexer = lexers.Get(val)
}
}
if lexer == nil {
guessLanguage := analyze.GetCodeLanguage(fileName, code)
lexer = lexers.Get(guessLanguage)
if lexer == nil {
lexer = lexers.Match(fileName)
if lexer == nil {
lexer = lexers.Fallback
}
}
}
lexer := GetChromaLexerWithFallback(fileName, language, code)
lexerName := formatLexerName(lexer.Config().Name)
iterator, err := lexer.Tokenise(nil, string(code))
@@ -218,7 +214,7 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro
lines := make([]template.HTML, 0, len(tokensLines))
for _, tokens := range tokensLines {
iterator = chroma.Literator(tokens...)
err = formatter.Format(htmlBuf, githubStyles, iterator)
err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
if err != nil {
return nil, "", fmt.Errorf("can't format code: %w", err)
}
@@ -229,8 +225,8 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro
return lines, lexerName, nil
}
// PlainText returns non-highlighted HTML for code
func PlainText(code []byte) []template.HTML {
// RenderPlainText returns non-highlighted HTML for code
func RenderPlainText(code []byte) []template.HTML {
r := bufio.NewReader(bytes.NewReader(code))
m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
for {
+35 -2
View File
@@ -112,7 +112,7 @@ c=2
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
out, lexerName, err := File(tt.name, "", []byte(tt.code))
out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code))
assert.NoError(t, err)
assert.Equal(t, tt.want, out)
assert.Equal(t, tt.lexerName, lexerName)
@@ -176,7 +176,7 @@ c=2`),
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
out := PlainText([]byte(tt.code))
out := RenderPlainText([]byte(tt.code))
assert.Equal(t, tt.want, out)
})
}
@@ -199,3 +199,36 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
assert.Equal(t, "<span>a</span>\n", string(ret[0]))
assert.Equal(t, "<span>b\n</span>", string(ret[1]))
}
// TestGetChromaLexer checks which chroma lexer GetChromaLexerWithFallback selects
// for various combinations of file name, explicit language and code content.
func TestGetChromaLexer(t *testing.T) {
	// register a temporary custom extension mapping and remove it when the test ends
	globalVars().highlightMapping[".my-html"] = "HTML"
	t.Cleanup(func() {
		delete(globalVars().highlightMapping, ".my-html")
	})

	type lexerCase struct {
		fileName string
		language string
		content  string
		expected string
	}
	cases := []lexerCase{
		{fileName: "test.py", expected: "Python"},

		{fileName: "any-file", language: "javascript", expected: "JavaScript"},
		{fileName: "any-file", content: "/* vim: set filetype=python */", expected: "Python"},
		{fileName: "any-file", expected: "fallback"},

		{fileName: "test.fs", expected: "Forth"},
		{fileName: "test.fs", language: "F#", expected: "FSharp"},
		{fileName: "test.fs", content: "let x = 1", expected: "FSharp"},

		{fileName: "test.c", expected: "C"},
		{fileName: "test.C", expected: "C++"},
		{fileName: "OLD-CODE.PAS", expected: "ObjectPascal"},
		{fileName: "test.my-html", expected: "HTML"},
	}

	for _, c := range cases {
		got := GetChromaLexerWithFallback(c.fileName, c.language, []byte(c.content))
		if !assert.NotNil(t, got, "case: %+v", c) {
			continue
		}
		assert.Equal(t, c.expected, got.Config().Name, "case: %+v", c)
	}
}
+2 -2
View File
@@ -72,10 +72,10 @@ func writeStrings(buf *bytes.Buffer, strs ...string) error {
func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine {
// we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
hl, _ := highlight.Code(filename, language, code)
hl, _ := highlight.RenderCodeFast(filename, language, code)
highlightedLines := strings.Split(string(hl), "\n")
// The lineNums outputted by highlight.Code might not match the original lineNums, because "highlight" removes the last `\n`
// The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n`
lines := make([]*ResultLine, min(len(highlightedLines), len(lineNums)))
for i := range lines {
lines[i] = &ResultLine{
+8 -30
View File
@@ -5,7 +5,6 @@ package orgmode
import (
"fmt"
"html"
"html/template"
"io"
"strings"
@@ -17,7 +16,6 @@ import (
"code.gitea.io/gitea/modules/setting"
"github.com/alecthomas/chroma/v2"
"github.com/alecthomas/chroma/v2/lexers"
"github.com/niklasfasching/go-org/org"
)
@@ -57,40 +55,20 @@ func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error
htmlWriter.HighlightCodeBlock = func(source, lang string, inline bool, params map[string]string) string {
defer func() {
if err := recover(); err != nil {
// catch the panic, log the error and return empty result
log.Error("Panic in HighlightCodeBlock: %v\n%s", err, log.Stack(2))
panic(err)
}
}()
w := &strings.Builder{}
lexer := lexers.Get(lang)
if lexer == nil && lang == "" {
lexer = lexers.Analyse(source)
if lexer == nil {
lexer = lexers.Fallback
}
lang = strings.ToLower(lexer.Config().Name)
}
lexer := highlight.GetChromaLexerWithFallback("", lang, nil) // don't use content to detect, it is too slow
lexer = chroma.Coalesce(lexer)
sb := &strings.Builder{}
// include language-x class as part of commonmark spec
if err := ctx.RenderInternal.FormatWithSafeAttrs(w, `<pre><code class="chroma language-%s">`, lang); err != nil {
return ""
}
if lexer == nil {
if _, err := w.WriteString(html.EscapeString(source)); err != nil {
return ""
}
} else {
lexer = chroma.Coalesce(lexer)
if _, err := w.WriteString(string(highlight.CodeFromLexer(lexer, source))); err != nil {
return ""
}
}
if _, err := w.WriteString("</code></pre>"); err != nil {
return ""
}
return w.String()
_ = ctx.RenderInternal.FormatWithSafeAttrs(sb, `<pre><code class="chroma language-%s">`, strings.ToLower(lexer.Config().Name))
_, _ = sb.WriteString(string(highlight.RenderCodeByLexer(lexer, source)))
_, _ = sb.WriteString("</code></pre>")
return sb.String()
}
w := &orgWriter{rctx: ctx, HTMLWriter: htmlWriter}
-2
View File
@@ -15,7 +15,6 @@ import (
"code.gitea.io/gitea/modules/eventsource"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/git/gitcmd"
"code.gitea.io/gitea/modules/highlight"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/markup/external"
@@ -131,7 +130,6 @@ func InitWebInstalled(ctx context.Context) {
mustInit(uinotification.Init)
mustInitCtx(ctx, archiver.Init)
highlight.NewContext()
external.RegisterRenderers()
markup.Init(markup_service.FormalRenderHelperFuncs())
+1 -1
View File
@@ -267,7 +267,7 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa
bufContent := buf.Bytes()
bufContent = charset.ToUTF8(bufContent, charset.ConvertOpts{})
highlighted, lexerName := highlight.Code(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent))
highlighted, lexerName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent))
unsafeLines := highlight.UnsafeSplitHighlightedLines(highlighted)
for i, br := range rows {
var line template.HTML
+3 -3
View File
@@ -124,11 +124,11 @@ func handleFileViewRenderSource(ctx *context.Context, filename string, attrs *at
}
language := attrs.GetLanguage().Value()
fileContent, lexerName, err := highlight.File(filename, language, buf)
fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf)
ctx.Data["LexerName"] = lexerName
if err != nil {
log.Error("highlight.File failed, fallback to plain text: %v", err)
fileContent = highlight.PlainText(buf)
log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err)
fileContent = highlight.RenderPlainText(buf)
}
status := &charset.EscapeStatus{}
statuses := make([]*charset.EscapeStatus, len(fileContent))
+2 -2
View File
@@ -331,7 +331,7 @@ func (diffSection *DiffSection) getLineContentForRender(lineIdx int, diffLine *D
if setting.Git.DisableDiffHighlight {
return template.HTML(html.EscapeString(diffLine.Content[1:]))
}
h, _ = highlight.Code(diffSection.FileName, fileLanguage, diffLine.Content[1:])
h, _ = highlight.RenderCodeFast(diffSection.FileName, fileLanguage, diffLine.Content[1:])
return h
}
@@ -1349,7 +1349,7 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit
func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML {
content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{}))
highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content)
highlightedNewContent, _ := highlight.RenderCodeFast(diffFile.Name, diffFile.Language, content)
unsafeLines := highlight.UnsafeSplitHighlightedLines(highlightedNewContent)
lines := make(map[int]template.HTML, len(unsafeLines))
// only save the highlighted lines we need, but not the whole file, to save memory