Skip to content

Commit 8daace8

Browse files
committed
[GlobPattern] Support brace expansions
Extend `GlobPattern` to support brace expansions, e.g., `foo.{c,cpp}` as discussed in https://reviews.llvm.org/D152762#4425203. The high level change was to turn `Tokens` into a list that gets larger when we see a new brace expansion term. Then in `GlobPattern::match()` we must check against each token group. This is a breaking change since `{` will no longer match a literal without escaping. However, `\{` will match the literal `{` before and after this change. Also, from a brief survey of LLVM, it seems that `GlobPattern` is mostly used for symbol and path matching, which likely won't need `{` in their patterns. See https://github.com/devongovett/glob-match#syntax for a nice glob reference. Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D153587
1 parent abacab6 commit 8daace8

File tree

3 files changed

+253
-34
lines changed

3 files changed

+253
-34
lines changed

llvm/include/llvm/Support/GlobPattern.h

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88
//
9-
// This file implements a glob pattern matcher. The glob pattern is the
10-
// rule used by the shell.
9+
// This file implements a glob pattern matcher.
1110
//
1211
//===----------------------------------------------------------------------===//
1312

@@ -20,30 +19,72 @@
2019
#include "llvm/Support/Error.h"
2120
#include <optional>
2221

23-
// This class represents a glob pattern. Supported metacharacters
24-
// are "*", "?", "\", "[<chars>]", "[^<chars>]", and "[!<chars>]".
2522
namespace llvm {
2623

24+
/// This class implements a glob pattern matcher similar to the one found in
25+
/// bash, but with some key differences. Namely, that \p "*" matches all
26+
/// characters and does not exclude path separators.
27+
///
28+
/// * \p "?" matches a single character.
29+
/// * \p "*" matches zero or more characters.
30+
/// * \p "[<chars>]" matches one character in the bracket. Character ranges,
31+
/// e.g., \p "[a-z]", and negative sets via \p "[^ab]" or \p "[!ab]" are also
32+
/// supported.
33+
/// * \p "{<glob>,...}" matches one of the globs in the list. Nested brace
34+
/// expansions are not supported. If \p MaxSubPatterns is empty then
35+
/// characters \p "{,}" are treated as literals.
36+
/// * \p "\" escapes the next character so it is treated as a literal.
37+
///
38+
/// Some known edge cases are:
39+
/// * \p "]" is allowed as the first character in a character class, i.e.,
40+
/// \p "[]]" is valid and matches the literal \p "]".
41+
/// * The empty character class, i.e., \p "[]", is invalid.
42+
/// * Empty or singleton brace expansions, e.g., \p "{}", \p "{a}", are invalid.
43+
/// * \p "}" and \p "," that are not inside a brace expansion are taken as
44+
/// literals, e.g., \p ",}" is valid but \p "{" is not.
45+
///
46+
/// For example, \p "*[/\\]foo.{c,cpp}" will match (unix or windows) paths to
47+
/// all files named \p "foo.c" or \p "foo.cpp".
2748
class GlobPattern {
2849
public:
29-
static Expected<GlobPattern> create(StringRef Pat);
50+
/// \param Pat the pattern to match against
51+
/// \param MaxSubPatterns if provided, limit the number of allowed subpatterns
52+
/// created from expanding braces; otherwise disable
53+
/// brace expansion
54+
static Expected<GlobPattern>
55+
create(StringRef Pat, std::optional<size_t> MaxSubPatterns = {});
56+
/// \returns \p true if \p S matches this glob pattern
3057
bool match(StringRef S) const;
3158

3259
// Returns true for glob pattern "*". Can be used to avoid expensive
3360
// preparation/acquisition of the input for match().
34-
bool isTrivialMatchAll() const { return Prefix.empty() && Pat == "*"; }
61+
bool isTrivialMatchAll() const {
62+
if (!Prefix.empty())
63+
return false;
64+
if (SubGlobs.size() != 1)
65+
return false;
66+
return SubGlobs[0].getPat() == "*";
67+
}
3568

3669
private:
37-
bool matchOne(StringRef Str) const;
70+
StringRef Prefix;
3871

39-
// Brackets with their end position and matched bytes.
40-
struct Bracket {
41-
const char *Next;
42-
BitVector Bytes;
43-
};
44-
SmallVector<Bracket, 0> Brackets;
72+
struct SubGlobPattern {
73+
/// \param Pat the pattern to match against
74+
static Expected<SubGlobPattern> create(StringRef Pat);
75+
/// \returns \p true if \p S matches this glob pattern
76+
bool match(StringRef S) const;
77+
StringRef getPat() const { return StringRef(Pat.data(), Pat.size()); }
4578

46-
StringRef Prefix, Pat;
79+
// Brackets with their end position and matched bytes.
80+
struct Bracket {
81+
size_t NextOffset;
82+
BitVector Bytes;
83+
};
84+
SmallVector<Bracket, 0> Brackets;
85+
SmallVector<char, 0> Pat;
86+
};
87+
SmallVector<SubGlobPattern, 1> SubGlobs;
4788
};
4889
}
4990

llvm/lib/Support/GlobPattern.cpp

Lines changed: 112 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "llvm/Support/GlobPattern.h"
14-
#include "llvm/ADT/ArrayRef.h"
1514
#include "llvm/ADT/StringRef.h"
1615
#include "llvm/Support/Errc.h"
1716

@@ -54,18 +53,115 @@ static Expected<BitVector> expand(StringRef S, StringRef Original) {
5453
return BV;
5554
}
5655

57-
Expected<GlobPattern> GlobPattern::create(StringRef S) {
56+
// Identify brace expansions in S and return the list of patterns they expand
57+
// into.
58+
static Expected<SmallVector<std::string, 1>>
59+
parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
60+
SmallVector<std::string> SubPatterns = {S.str()};
61+
if (!MaxSubPatterns || !S.contains('{'))
62+
return SubPatterns;
63+
64+
struct BraceExpansion {
65+
size_t Start;
66+
size_t Length;
67+
SmallVector<StringRef, 2> Terms;
68+
};
69+
SmallVector<BraceExpansion, 0> BraceExpansions;
70+
71+
BraceExpansion *CurrentBE = nullptr;
72+
size_t TermBegin;
73+
for (size_t I = 0, E = S.size(); I != E; ++I) {
74+
if (S[I] == '[') {
75+
I = S.find(']', I + 2);
76+
if (I == std::string::npos)
77+
return make_error<StringError>("invalid glob pattern, unmatched '['",
78+
errc::invalid_argument);
79+
} else if (S[I] == '{') {
80+
if (CurrentBE)
81+
return make_error<StringError>(
82+
"nested brace expansions are not supported",
83+
errc::invalid_argument);
84+
CurrentBE = &BraceExpansions.emplace_back();
85+
CurrentBE->Start = I;
86+
TermBegin = I + 1;
87+
} else if (S[I] == ',') {
88+
if (!CurrentBE)
89+
continue;
90+
CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
91+
TermBegin = I + 1;
92+
} else if (S[I] == '}') {
93+
if (!CurrentBE)
94+
continue;
95+
if (CurrentBE->Terms.empty())
96+
return make_error<StringError>(
97+
"empty or singleton brace expansions are not supported",
98+
errc::invalid_argument);
99+
CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
100+
CurrentBE->Length = I - CurrentBE->Start + 1;
101+
CurrentBE = nullptr;
102+
} else if (S[I] == '\\') {
103+
if (++I == E)
104+
return make_error<StringError>("invalid glob pattern, stray '\\'",
105+
errc::invalid_argument);
106+
}
107+
}
108+
if (CurrentBE)
109+
return make_error<StringError>("incomplete brace expansion",
110+
errc::invalid_argument);
111+
112+
size_t NumSubPatterns = 1;
113+
for (auto &BE : BraceExpansions) {
114+
if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
115+
NumSubPatterns = std::numeric_limits<size_t>::max();
116+
break;
117+
}
118+
NumSubPatterns *= BE.Terms.size();
119+
}
120+
if (NumSubPatterns > *MaxSubPatterns)
121+
return make_error<StringError>("too many brace expansions",
122+
errc::invalid_argument);
123+
// Replace brace expansions in reverse order so that we don't invalidate
124+
// earlier start indices
125+
for (auto &BE : reverse(BraceExpansions)) {
126+
SmallVector<std::string> OrigSubPatterns;
127+
std::swap(SubPatterns, OrigSubPatterns);
128+
for (StringRef Term : BE.Terms)
129+
for (StringRef Orig : OrigSubPatterns)
130+
SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
131+
}
132+
return SubPatterns;
133+
}
134+
135+
Expected<GlobPattern>
136+
GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
58137
GlobPattern Pat;
59138

60139
// Store the prefix that does not contain any metacharacter.
61-
size_t PrefixSize = S.find_first_of("?*[\\");
140+
size_t PrefixSize = S.find_first_of("?*[{\\");
62141
Pat.Prefix = S.substr(0, PrefixSize);
63142
if (PrefixSize == std::string::npos)
64143
return Pat;
65144
S = S.substr(PrefixSize);
66145

146+
SmallVector<std::string, 1> SubPats;
147+
if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
148+
return Err;
149+
for (StringRef SubPat : SubPats) {
150+
auto SubGlobOrErr = SubGlobPattern::create(SubPat);
151+
if (!SubGlobOrErr)
152+
return SubGlobOrErr.takeError();
153+
Pat.SubGlobs.push_back(*SubGlobOrErr);
154+
}
155+
156+
return Pat;
157+
}
158+
159+
Expected<GlobPattern::SubGlobPattern>
160+
GlobPattern::SubGlobPattern::create(StringRef S) {
161+
SubGlobPattern Pat;
162+
67163
// Parse brackets.
68-
Pat.Pat = S;
164+
Pat.Pat.assign(S.begin(), S.end());
69165
for (size_t I = 0, E = S.size(); I != E; ++I) {
70166
if (S[I] == '[') {
71167
// ']' is allowed as the first character of a character class. '[]' is
@@ -83,7 +179,7 @@ Expected<GlobPattern> GlobPattern::create(StringRef S) {
83179
return BV.takeError();
84180
if (Invert)
85181
BV->flip();
86-
Pat.Brackets.push_back(Bracket{S.data() + J + 1, std::move(*BV)});
182+
Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
87183
I = J;
88184
} else if (S[I] == '\\') {
89185
if (++I == E)
@@ -95,13 +191,20 @@ Expected<GlobPattern> GlobPattern::create(StringRef S) {
95191
}
96192

97193
bool GlobPattern::match(StringRef S) const {
98-
return S.consume_front(Prefix) && matchOne(S);
194+
if (!S.consume_front(Prefix))
195+
return false;
196+
if (SubGlobs.empty() && S.empty())
197+
return true;
198+
for (auto &Glob : SubGlobs)
199+
if (Glob.match(S))
200+
return true;
201+
return false;
99202
}
100203

101204
// Factor the pattern into segments split by '*'. The segment is matched
102205
// sequentially by finding the first occurrence past the end of the previous
103206
// match.
104-
bool GlobPattern::matchOne(StringRef Str) const {
207+
bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
105208
const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
106209
*SavedS = S;
107210
const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
@@ -118,7 +221,7 @@ bool GlobPattern::matchOne(StringRef Str) const {
118221
continue;
119222
} else if (*P == '[') {
120223
if (Brackets[B].Bytes[uint8_t(*S)]) {
121-
P = Brackets[B++].Next;
224+
P = Pat.data() + Brackets[B++].NextOffset;
122225
++S;
123226
continue;
124227
}
@@ -143,5 +246,5 @@ bool GlobPattern::matchOne(StringRef Str) const {
143246
}
144247
// All bytes in Str have been matched. Return true if the rest part of Pat is
145248
// empty or contains only '*'.
146-
return Pat.find_first_not_of('*', P - Pat.data()) == std::string::npos;
249+
return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
147250
}

0 commit comments

Comments
 (0)