go-gitea · lunny · Aug 15, 2019 · Aug 10, 2019 · Aug 11, 2019 · Aug 11, 2019
diff --git a/models/repo_indexer.go b/models/repo_indexer.go
@@ -16,6 +16,8 @@ import (
 	"code.gitea.io/gitea/modules/setting"
 
 	"github.com/ethantkoenig/rupture"
+	"golang.org/x/net/html/charset"
+	"golang.org/x/text/transform"
 )
 
 // RepoIndexerStatus status of a repo's entry in the repo indexer
@@ -207,14 +209,15 @@ func addUpdate(update fileUpdate, repo *Repository, batch rupture.FlushingBatch)
 	if err != nil {
 		return err
 	} else if !base.IsTextFile(fileContents) {
+		// FIXME: UTF-16 files will probably fail here
 		return nil
 	}
 	indexerUpdate := indexer.RepoIndexerUpdate{
 		Filepath: update.Filename,
 		Op:       indexer.RepoIndexerOpUpdate,
 		Data: &indexer.RepoIndexerData{
 			RepoID:  repo.ID,
-			Content: string(fileContents),
+			Content: string(toUTF8DropErrors(fileContents)),
 		},
 	}
 	return indexerUpdate.AddToFlushingBatch(batch)
@@ -360,3 +363,36 @@ func addOperationToQueue(op repoIndexerOperation) {
 		}()
 	}
 }
+
+// toUTF8DropErrors attempts a best-effort conversion of content to UTF-8.
+// Content detected as UTF-8, or whose encoding cannot be detected, is only
+// passed through base.RemoveBOMIfPresent; otherwise it is decoded first.
+func toUTF8DropErrors(content []byte) []byte {
+	label, detectErr := base.DetectEncoding(content)
+	if detectErr != nil || label == "UTF-8" {
+		return base.RemoveBOMIfPresent(content)
+	}
+
+	enc, _ := charset.Lookup(label)
+	if enc == nil {
+		// No encoding is registered for the detected label: return the input as-is.
+		return content
+	}
+
+	// Decode in segments. Whenever the decoder reports an error, keep what it
+	// produced so far, emit a space, skip one input byte past the consumed
+	// part and resume from there. Some parts of the file might be lost.
+	var out []byte
+	dec := enc.NewDecoder()
+	for pos, done := 0, false; !done; {
+		chunk, consumed, decodeErr := transform.Bytes(dec, content[pos:])
+		out = append(out, chunk...)
+		if decodeErr != nil {
+			out = append(out, ' ')
+			pos += consumed + 1
+		}
+		done = decodeErr == nil || pos >= len(content)
+	}
+
+	return base.RemoveBOMIfPresent(out)
+}