From af6a21ab1d888ab22d0b02e013ebdfef76ddb881 Mon Sep 17 00:00:00 2001 From: oppiliappan Date: Thu, 10 Jul 2025 18:23:56 +0100 Subject: [PATCH] knotserver: simplify language calculation logic Change-Id: plrtnootzxvymkntpxuzuyqqorrnsxnn reference forgejo's language analysis a bit more, and calculate ratios based on number of bytes Signed-off-by: oppiliappan --- knotserver/git/git.go | 29 ++++++++-------- knotserver/routes.go | 81 +++++++++++++++++++++++-------------------- types/repo.go | 2 +- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/knotserver/git/git.go b/knotserver/git/git.go index ace3291..872dafb 100644 --- a/knotserver/git/git.go +++ b/knotserver/git/git.go @@ -2,6 +2,7 @@ package git import ( "archive/tar" + "bytes" "fmt" "io" "io/fs" @@ -201,8 +202,6 @@ func (g *GitRepo) LastCommit() (*object.Commit, error) { } func (g *GitRepo) FileContentN(path string, cap int64) ([]byte, error) { - buf := []byte{} - c, err := g.r.CommitObject(g.h) if err != nil { return nil, fmt.Errorf("commit object: %w", err) @@ -219,21 +218,21 @@ func (g *GitRepo) FileContentN(path string, cap int64) ([]byte, error) { } isbin, _ := file.IsBinary() - - if !isbin { - reader, err := file.Reader() - if err != nil { - return nil, err - } - bufReader := io.LimitReader(reader, cap) - _, err = bufReader.Read(buf) - if err != nil { - return nil, err - } - return buf, nil - } else { + if isbin { return nil, ErrBinaryFile } + + reader, err := file.Reader() + if err != nil { + return nil, err + } + + buf := new(bytes.Buffer) + if _, err = buf.ReadFrom(io.LimitReader(reader, cap)); err != nil { + return nil, err + } + + return buf.Bytes(), nil } func (g *GitRepo) FileContent(path string) (string, error) { diff --git a/knotserver/routes.go b/knotserver/routes.go index 163bd60..1fe3891 100644 --- a/knotserver/routes.go +++ b/knotserver/routes.go @@ -18,6 +18,7 @@ import ( "strconv" "strings" "sync" + "time" securejoin "github.com/cyphar/filepath-securejoin" "github.com/gliderlabs/ssh" @@ -763,73 +764,79 @@ func (h *Handle) RepoForkAheadBehind(w http.ResponseWriter, r *http.Request) { } func (h *Handle) RepoLanguages(w http.ResponseWriter, r *http.Request) { - path, _ := securejoin.SecureJoin(h.c.Repo.ScanPath, didPath(r)) + repoPath, _ := securejoin.SecureJoin(h.c.Repo.ScanPath, didPath(r)) ref := chi.URLParam(r, "ref") ref, _ = url.PathUnescape(ref) l := h.l.With("handler", "RepoLanguages") - gr, err := git.Open(path, ref) + gr, err := git.Open(repoPath, ref) if err != nil { l.Error("opening repo", "error", err.Error()) notFound(w) return } - languageFileCount := make(map[string]int) + sizes := make(map[string]int64) - err = recurseEntireTree(r.Context(), gr, func(absPath string) { - lang, safe := enry.GetLanguageByExtension(absPath) - if len(lang) == 0 || !safe { - content, _ := gr.FileContentN(absPath, 1024) - if !safe { - lang = enry.GetLanguage(absPath, content) - if len(lang) == 0 { - lang = "Other" - } - } else { - lang, _ = enry.GetLanguageByContent(absPath, content) - if len(lang) == 0 { - lang = "Other" - } - } + ctx, cancel := context.WithTimeout(r.Context(), 1*time.Second) + defer cancel() + + err = gr.Walk(ctx, "", func(node object.TreeEntry, parent *object.Tree, root string) error { + filepath := path.Join(root, node.Name) + + content, err := gr.FileContentN(filepath, 16*1024) // 16KB + if err != nil { + return nil } - v, ok := languageFileCount[lang] - if ok { - languageFileCount[lang] = v + 1 - } else { - languageFileCount[lang] = 1 + if enry.IsGenerated(filepath, content) { + return nil + } + + language := analyzeLanguage(node, content) + if group := enry.GetLanguageGroup(language); group != "" { + language = group } - }, "") + + langType := enry.GetLanguageType(language) + if langType != enry.Programming && langType != enry.Markup && langType != enry.Unknown { + return nil + } + + sz, _ := parent.Size(node.Name) + sizes[language] += sz + + return nil + }) if err != nil { l.Error("failed to recurse file tree", "error", err.Error()) writeError(w, err.Error(), http.StatusNoContent) return } - resp := types.RepoLanguageResponse{Languages: languageFileCount} + resp := types.RepoLanguageResponse{Languages: sizes} writeJSON(w, resp) return } -func recurseEntireTree(ctx context.Context, git *git.GitRepo, callback func(absPath string), filePath string) error { - files, err := git.FileTree(ctx, filePath) - if err != nil { - log.Println(err) - return err +func analyzeLanguage(node object.TreeEntry, content []byte) string { + language, ok := enry.GetLanguageByExtension(node.Name) + if ok { + return language } - for _, file := range files { - absPath := path.Join(filePath, file.Name) - if !file.IsFile { - return recurseEntireTree(ctx, git, callback, absPath) - } - callback(absPath) + language, ok = enry.GetLanguageByFilename(node.Name) + if ok { + return language } - return nil + if len(content) == 0 { + return enry.OtherLanguage + } + + return enry.GetLanguage(node.Name, content) } func (h *Handle) RepoForkSync(w http.ResponseWriter, r *http.Request) { diff --git a/types/repo.go b/types/repo.go index 46146e9..e0d3d88 100644 --- a/types/repo.go +++ b/types/repo.go @@ -117,5 +117,5 @@ type RepoLanguageDetails struct { type RepoLanguageResponse struct { // Language: File count - Languages map[string]int `json:"languages"` + Languages map[string]int64 `json:"languages"` } -- 2.43.0