knotserver: simplify language calculation logic #270

merged
opened by oppi.li targeting master from push-zvkrywwskknq

Follow Forgejo's language analysis more closely, and calculate language ratios from the number of bytes per language rather than the number of files.

Signed-off-by: oppiliappan <me@oppi.li>

Changed files
+59 -53
knotserver
types
+14 -15
knotserver/git/git.go
···
import (
"archive/tar"
+
"bytes"
"fmt"
"io"
"io/fs"
···
}
func (g *GitRepo) FileContentN(path string, cap int64) ([]byte, error) {
-
buf := []byte{}
-
c, err := g.r.CommitObject(g.h)
if err != nil {
return nil, fmt.Errorf("commit object: %w", err)
···
}
isbin, _ := file.IsBinary()
-
-
if !isbin {
-
reader, err := file.Reader()
-
if err != nil {
-
return nil, err
-
}
-
bufReader := io.LimitReader(reader, cap)
-
_, err = bufReader.Read(buf)
-
if err != nil {
-
return nil, err
-
}
-
return buf, nil
-
} else {
+
if isbin {
return nil, ErrBinaryFile
}
+
+
reader, err := file.Reader()
+
if err != nil {
+
return nil, err
+
}
+
+
buf := new(bytes.Buffer)
+
if _, err = buf.ReadFrom(io.LimitReader(reader, cap)); err != nil {
+
return nil, err
+
}
+
+
return buf.Bytes(), nil
}
func (g *GitRepo) FileContent(path string) (string, error) {
+44 -37
knotserver/routes.go
···
"strconv"
"strings"
"sync"
+
"time"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/gliderlabs/ssh"
···
}
func (h *Handle) RepoLanguages(w http.ResponseWriter, r *http.Request) {
-
path, _ := securejoin.SecureJoin(h.c.Repo.ScanPath, didPath(r))
+
repoPath, _ := securejoin.SecureJoin(h.c.Repo.ScanPath, didPath(r))
ref := chi.URLParam(r, "ref")
ref, _ = url.PathUnescape(ref)
l := h.l.With("handler", "RepoLanguages")
-
gr, err := git.Open(path, ref)
+
gr, err := git.Open(repoPath, ref)
if err != nil {
l.Error("opening repo", "error", err.Error())
notFound(w)
return
}
-
languageFileCount := make(map[string]int)
+
sizes := make(map[string]int64)
-
err = recurseEntireTree(r.Context(), gr, func(absPath string) {
-
lang, safe := enry.GetLanguageByExtension(absPath)
-
if len(lang) == 0 || !safe {
-
content, _ := gr.FileContentN(absPath, 1024)
-
if !safe {
-
lang = enry.GetLanguage(absPath, content)
-
if len(lang) == 0 {
-
lang = "Other"
-
}
-
} else {
-
lang, _ = enry.GetLanguageByContent(absPath, content)
-
if len(lang) == 0 {
-
lang = "Other"
-
}
-
}
+
ctx, cancel := context.WithTimeout(r.Context(), 1*time.Second)
+
defer cancel()
+
+
err = gr.Walk(ctx, "", func(node object.TreeEntry, parent *object.Tree, root string) error {
+
filepath := path.Join(root, node.Name)
+
+
content, err := gr.FileContentN(filepath, 16*1024) // 16KB
+
if err != nil {
+
return nil
}
-
v, ok := languageFileCount[lang]
-
if ok {
-
languageFileCount[lang] = v + 1
-
} else {
-
languageFileCount[lang] = 1
+
if enry.IsGenerated(filepath, content) {
+
return nil
+
}
+
+
language := analyzeLanguage(node, content)
+
if group := enry.GetLanguageGroup(language); group != "" {
+
language = group
}
-
}, "")
+
+
langType := enry.GetLanguageType(language)
+
if langType != enry.Programming && langType != enry.Markup && langType != enry.Unknown {
+
return nil
+
}
+
+
sz, _ := parent.Size(node.Name)
+
sizes[language] += sz
+
+
return nil
+
})
if err != nil {
l.Error("failed to recurse file tree", "error", err.Error())
writeError(w, err.Error(), http.StatusNoContent)
return
}
-
resp := types.RepoLanguageResponse{Languages: languageFileCount}
+
resp := types.RepoLanguageResponse{Languages: sizes}
writeJSON(w, resp)
return
}
-
func recurseEntireTree(ctx context.Context, git *git.GitRepo, callback func(absPath string), filePath string) error {
-
files, err := git.FileTree(ctx, filePath)
-
if err != nil {
-
log.Println(err)
-
return err
+
func analyzeLanguage(node object.TreeEntry, content []byte) string {
+
language, ok := enry.GetLanguageByExtension(node.Name)
+
if ok {
+
return language
}
-
for _, file := range files {
-
absPath := path.Join(filePath, file.Name)
-
if !file.IsFile {
-
return recurseEntireTree(ctx, git, callback, absPath)
-
}
-
callback(absPath)
+
language, ok = enry.GetLanguageByFilename(node.Name)
+
if ok {
+
return language
}
-
return nil
+
if len(content) == 0 {
+
return enry.OtherLanguage
+
}
+
+
return enry.GetLanguage(node.Name, content)
}
func (h *Handle) RepoForkSync(w http.ResponseWriter, r *http.Request) {
+1 -1
types/repo.go
···
type RepoLanguageResponse struct {
// Language: File count
-
Languages map[string]int `json:"languages"`
+
Languages map[string]int64 `json:"languages"`
}