Commit c757113

Eric Bower  ·  2026-05-12 00:06:58 -0400 EDT
parent cce1481
chore: better text file detection
1 files changed,  +64, -6
+64, -6
 1@@ -9,7 +9,6 @@ import (
 2 	"html/template"
 3 	"io"
 4 	"log/slog"
 5-	"math"
 6 	"os"
 7 	"path/filepath"
 8 	"sort"
 9@@ -284,13 +283,72 @@ func isText(s string) bool {
10 	return true
11 }
12 
13+// newSet builds a string set from a list of values for O(1) lookup.
14+func newSet(vals ...string) map[string]struct{} {
15+	m := make(map[string]struct{}, len(vals))
16+	for _, v := range vals {
17+		m[v] = struct{}{}
18+	}
19+	return m
20+}
21+
 22+// knownBinaryExts holds the lower-case extensions (leading dot included) that isTextFile reports as binary without looking at the content.
 23+var knownBinaryExts = newSet(
 24+	// images (NOTE(review): .svg is XML text, yet it is classified as binary here; confirm this is intended)
 25+	".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg", ".tiff", ".tif", ".psd",
 26+	// archives / compressed
 27+	".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".tgz", ".war", ".jar",
 28+	// documents
 29+	".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
 30+	// executables / libraries
 31+	".exe", ".dll", ".so", ".dylib", ".a", ".o",
 32+	// fonts
 33+	".ttf", ".otf", ".woff", ".woff2", ".eot",
 34+	// audio / video
 35+	".mp3", ".mp4", ".avi", ".mov", ".wav", ".flac", ".ogg", ".webm",
 36+	// data / serialized
 37+	".pb", ".msgpack", ".parquet", ".avro",
 38+	// other
 39+	".class", ".pyc", ".pyo", ".wasm", ".db", ".sqlite", ".sqlite3",
 40+)
41+
42+// knownTextExts lists extensions that are always text regardless of content.
43+var knownTextExts = newSet(
44+	// code
45+	".go", ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".c", ".h", ".cpp", ".hpp", ".rs", ".rb", ".php", ".pl", ".sh", ".bash", ".zsh", ".fish", ".ps1",
46+	// markup / data
47+	".html", ".htm", ".css", ".scss", ".less", ".xml", ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
48+	// docs
49+	".md", ".markdown", ".txt", ".rst", ".tex", ".bib", ".csv", ".tsv",
50+	// config / build
51+	".Dockerfile", ".dockerignore", ".gitignore", ".gitattributes", ".editorconfig",
52+	// other text
53+	".diff", ".patch", ".log", ".sql", ".graphql", ".proto", ".makefile", ".cmake",
54+)
55+
56 // isTextFile reports whether the file has a known extension indicating
57 // a text file, or if a significant chunk of the specified file looks like
58 // correct UTF-8; that is, if it is likely that the file contains human-
59-// readable text.
60-func isTextFile(text string) bool {
61-	num := math.Min(float64(len(text)), 1024)
62-	return isText(text[0:int(num)])
63+// readable text. Extension check takes priority as a fast path.
64+func isTextFile(filename, text string) bool {
65+	ext := strings.ToLower(filepath.Ext(filename))
66+
67+	// fast path: known binary extension
68+	if _, ok := knownBinaryExts[ext]; ok {
69+		return false
70+	}
71+	// fast path: known text extension
72+	if _, ok := knownTextExts[ext]; ok {
73+		return true
74+	}
75+	// also check the full filename for extensionless known text files
76+	nameLower := strings.ToLower(filename)
77+	if _, ok := knownTextExts["."+nameLower]; ok {
78+		return true
79+	}
80+
81+	// fallback: inspect bytes
82+	return isText(text)
83 }
84 
85 func toPretty(b int64) string {
86@@ -454,7 +512,7 @@ func (c *Config) writeHTMLTreeFile(pageData *PageData, treeItem *TreeItem) strin
87 	bail(err)
88 	str := string(b)
89 
90-	treeItem.IsTextFile = isTextFile(str)
91+	treeItem.IsTextFile = isTextFile(treeItem.Entry.Name(), str)
92 
93 	contents := "binary file, cannot display"
94 	if treeItem.IsTextFile {