Skip to content

Commit f33d7fd

Browse files
committed
fix: improve host name detection
1 parent b5ecf9a commit f33d7fd

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

notwords.go

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,17 @@ import (
44
"bytes"
55
"regexp"
66
"strings"
7+
"unicode"
78
)
89

910
var (
10-
reEmail = regexp.MustCompile(`[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9-.]+\.[a-zA-Z]{2,6}[^a-zA-Z]`)
11-
reHost = regexp.MustCompile(`[a-zA-Z0-9-.]+\.[a-zA-Z]+`)
12-
reBackslash = regexp.MustCompile(`\\[a-z]`)
11+
reEmail = regexp.MustCompile(`[[:alnum:]_.%+-]+@[[:alnum:]-.]+\.[[:alpha:]]{2,6}[^[:alpha:]]`)
12+
reBackslash = regexp.MustCompile(`\\[[:lower:]]`)
13+
14+
// reHost is the regular expression used to detect host names.
15+
// The length of any one label is limited to between 1 and 63 octets. (https://www.ietf.org/rfc/rfc2181.txt)
16+
// A TLD has at least 2 letters.
17+
reHost = regexp.MustCompile(`([[:alnum:]-]+\.)+[[:alpha:]]{2,63}`)
1318
)
1419

1520
// RemovePath attempts to strip away embedded file system paths, e.g.
@@ -62,14 +67,26 @@ func replaceWithBlanks(s string) string {
6267
return strings.Repeat(" ", len(s))
6368
}
6469

70+
// replaceHost behaves like replaceWithBlanks, except that a string containing at least one uppercase letter is returned unchanged.
71+
// Domain names are case-insensitive but browsers and DNS convert uppercase to lower case. (https://www.ietf.org/rfc/rfc4343.txt)
72+
func replaceHost(s string) string {
73+
for _, r := range s {
74+
if unicode.IsUpper(r) {
75+
return s
76+
}
77+
}
78+
79+
return replaceWithBlanks(s)
80+
}
81+
6582
// RemoveEmail removes email-like strings, e.g. "user@example.com", "first.last@mail.example.org".
6683
func RemoveEmail(s string) string {
6784
return reEmail.ReplaceAllStringFunc(s, replaceWithBlanks)
6885
}
6986

7087
// RemoveHost removes host-like strings, e.g. "foobar.com", "abc123.fo1231.biz".
7188
func RemoveHost(s string) string {
72-
return reHost.ReplaceAllStringFunc(s, replaceWithBlanks)
89+
return reHost.ReplaceAllStringFunc(s, replaceHost)
7390
}
7491

7592
// RemoveBackslashEscapes removes characters that are preceded by a backslash.

notwords_test.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,20 @@ func TestNotWords(t *testing.T) {
1414
{word: "[/foo/bar] abc", want: "[ ] abc"},
1515
{word: "/", want: "/"},
1616
{word: "x [email protected] y", want: "x y"},
17+
{word: "x fqdn.example.org. y", want: "x . y"},
1718
{word: "x infinitie.net y", want: "x y"},
18-
{word: "(s.svc.GetObject(", want: "( ("},
19+
{word: "x infinitie.net ", want: "x "},
20+
{word: "x infinitie.net", want: "x "},
21+
{word: "x foo.example.com y", want: "x y"},
22+
{word: "x foo.example.com ", want: "x "},
23+
{word: "x foo.example.com", want: "x "},
24+
{word: "foo.example.com y", want: " y"},
25+
{word: "foo.example.com", want: " "},
26+
{word: "(s.svc.GetObject(", want: "(s.svc.GetObject("},
27+
{word: "defer file.Close()", want: "defer file.Close()"},
28+
{word: "defer file.c()", want: "defer file.c()"},
29+
{word: "defer file.cl()", want: "defer ()"}, // false negative
30+
{word: "defer file.close()", want: "defer ()"}, // false negative
1931
{word: "\\nto", want: " to"},
2032
}
2133

0 commit comments

Comments
 (0)