Skip to content

Convert files to utf-8 for indexing #7814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 15, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion models/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import (
"code.gitea.io/gitea/modules/setting"

"github.com/ethantkoenig/rupture"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// RepoIndexerStatus status of a repo's entry in the repo indexer
Expand Down Expand Up @@ -207,14 +209,15 @@ func addUpdate(update fileUpdate, repo *Repository, batch rupture.FlushingBatch)
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
indexerUpdate := indexer.RepoIndexerUpdate{
Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{
RepoID: repo.ID,
Content: string(fileContents),
Content: string(toUTF8DropErrors(fileContents)),
},
}
return indexerUpdate.AddToFlushingBatch(batch)
Expand Down Expand Up @@ -360,3 +363,36 @@ func addOperationToQueue(op repoIndexerOperation) {
}()
}
}

// toUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
func toUTF8DropErrors(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return base.RemoveBOMIfPresent(content)
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}

// We ignore any non-decodable parts from the file.
// Some parts might be lost
var decoded []byte
decoder := encoding.NewDecoder()
idx := 0
for {
result, n, err := transform.Bytes(decoder, content[idx:])
decoded = append(decoded, result...)
if err == nil {
break
}
decoded = append(decoded, ' ')
idx = idx + n + 1
if idx >= len(content) {
break
}
}

return base.RemoveBOMIfPresent(decoded)
}