Skip to content

Commit ac07b67

Browse files
author
José Valim
committed
Add String.splitter/3 and optimize String.split/3
1 parent a58a4eb commit ac07b67

File tree

2 files changed

+134
-115
lines changed

2 files changed

+134
-115
lines changed

lib/elixir/lib/string.ex

Lines changed: 120 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,34 @@ defmodule String do
169169
when retrieving data from the external source. For example, a
170170
driver that reads strings from a database will be the one
171171
responsible to check the validity of the encoding.
172+
173+
## Patterns
174+
175+
Many functions in this module work with patterns. For example,
176+
String.split/2 can split a string into multiple strings given
177+
a pattern. This pattern can be a string, a list of strings or
178+
a compiled pattern:
179+
180+
iex> String.split("foo bar", " ")
181+
["foo", "bar"]
182+
183+
iex> String.split("foo bar!", [" ", "!"])
184+
["foo", "bar", ""]
185+
186+
iex> pattern = :binary.compile_pattern([" ", "!"])
187+
iex> String.split("foo bar!", pattern)
188+
["foo", "bar", ""]
189+
190+
The compiled pattern is useful when the same match will
191+
be done over and over again. Note though the compiled
192+
pattern cannot be stored in a module attribute as the pattern
193+
is generated at runtime and does not survive compile time.
172194
"""
173195

174196
@type t :: binary
175197
@type codepoint :: t
176198
@type grapheme :: t
199+
@type pattern :: t | [t] | :binary.cp
177200

178201
@doc """
179202
Checks if a string is printable considering it is encoded
@@ -285,66 +308,100 @@ defmodule String do
285308
iex> String.split("abc", "", parts: 2)
286309
["a", "bc"]
287310
311+
A precompiled pattern can also be given:
312+
313+
iex> pattern = :binary.compile_pattern([" ", ","])
314+
iex> String.split("1,2 3,4", pattern)
315+
["1", "2", "3", "4"]
316+
288317
"""
289-
@spec split(t, t | [t] | Regex.t) :: [t]
290-
@spec split(t, t | [t] | Regex.t, Keyword.t) :: [t]
318+
@spec split(t, pattern | Regex.t) :: [t]
319+
@spec split(t, pattern | Regex.t, Keyword.t) :: [t]
291320
def split(string, pattern, options \\ [])
292321

293-
def split(string, "", options) do
294-
parts = Keyword.get(options, :parts, :infinity)
295-
split_codepoints(string, parts_to_index(parts), Keyword.get(options, :trim, false))
322+
def split(string, %Regex{} = pattern, options) do
323+
Regex.split(pattern, string, options)
324+
end
325+
326+
def split(string, pattern, []) when pattern != "" do
327+
:binary.split(string, pattern, [:global])
296328
end
297329

298330
def split(string, pattern, options) do
299-
if Regex.regex?(pattern) do
300-
Regex.split(pattern, string, options)
301-
else
302-
parts = Keyword.get(options, :parts, :infinity)
303-
trim = Keyword.get(options, :trim, false)
304-
if parts == :infinity and trim == false do
305-
:binary.split(string, pattern, [:global])
306-
else
307-
split_parts(string, pattern, parts_to_index(parts), trim)
308-
end
309-
end
331+
parts = Keyword.get(options, :parts, :infinity)
332+
trim = Keyword.get(options, :trim, false)
333+
pattern = maybe_compile_pattern(pattern)
334+
split_each(string, pattern, trim, parts_to_index(parts))
310335
end
311336

312337
defp parts_to_index(:infinity), do: 0
313338
defp parts_to_index(n) when is_integer(n) and n > 0, do: n
314339

315-
defp split_codepoints(binary, 1, _trim), do: [binary]
316-
defp split_codepoints(<<h :: utf8, t :: binary>>, count, trim),
317-
do: [<<h :: utf8>>|split_codepoints(t, count - 1, trim)]
318-
defp split_codepoints(<<h, t :: binary>>, count, trim),
319-
do: [<<h>>|split_codepoints(t, count - 1, trim)]
320-
defp split_codepoints(<<>>, _, true), do: []
321-
defp split_codepoints(<<>>, _, false), do: [""]
322-
323-
defp split_parts("", _pattern, _num, true), do: []
324-
defp split_parts("", _pattern, _num, _trim), do: [""]
325-
defp split_parts(string, _pattern, 1, _trim), do: [string]
326-
defp split_parts(string, pattern, num, trim) do
327-
case :binary.split(string, pattern) do
328-
[""] when trim ->
329-
[]
330-
[head] ->
331-
[head]
332-
[head, tail] ->
333-
if trim and head == "" do
334-
split_parts(tail, pattern, num, trim)
335-
else
336-
[head|split_parts(tail, pattern, num-1, trim)]
337-
end
340+
defp split_each(string, _pattern, _trim, 1), do: [string]
341+
defp split_each(string, pattern, trim, count) do
342+
case do_splitter(string, pattern, trim) do
343+
{h, t} -> [h|split_each(t, pattern, trim, count - 1)]
344+
nil -> []
338345
end
339346
end
340347

348+
@doc """
349+
Splits a string on demand.
350+
351+
Returns an enumerable that splits the string on
352+
demand, instead of splitting all data upfront.
353+
354+
Note splitter does not support regular expressions
355+
(as it is often more efficient to have the regular
356+
expressions traverse the string at once than in
357+
multiple passes).
358+
359+
## Options
360+
361+
* :trim - when true, does not emit empty patterns
362+
"""
363+
@spec splitter(t, pattern, Keyword.t) :: Enumerable.t
364+
def splitter(string, pattern, options \\ []) do
365+
pattern = maybe_compile_pattern(pattern)
366+
trim = Keyword.get(options, :trim, false)
367+
Stream.unfold(string, &do_splitter(&1, pattern, trim))
368+
end
369+
370+
defp do_splitter(:nomatch, _pattern, _), do: nil
371+
defp do_splitter("", _pattern, true), do: nil
372+
defp do_splitter("", _pattern, false), do: {"", :nomatch}
373+
374+
defp do_splitter(bin, "", _trim) do
375+
next_grapheme(bin)
376+
end
377+
378+
defp do_splitter(bin, pattern, trim) do
379+
case :binary.match(bin, pattern) do
380+
{0, length} when trim ->
381+
do_splitter(:binary.part(bin, length, byte_size(bin) - length), pattern, trim)
382+
{pos, length} ->
383+
final = pos + length
384+
{:binary.part(bin, 0, pos),
385+
:binary.part(bin, final, byte_size(bin) - final)}
386+
:nomatch ->
387+
{bin, :nomatch}
388+
end
389+
end
390+
391+
defp maybe_compile_pattern(""), do: ""
392+
defp maybe_compile_pattern(pattern), do: :binary.compile_pattern(pattern)
393+
341394
@doc """
342395
Splits a string into two at the specified offset. When the offset given is
343396
negative, location is counted from the end of the string.
344397
345-
The offset is capped to the length of the string.
398+
The offset is capped to the length of the string. Returns a tuple with
399+
two elements.
346400
347-
Returns a tuple with two elements.
401+
Note: keep in mind this function splits on graphemes and for such it
402+
has to linearly traverse the string. If you want to split a string or
403+
a binary based on the number of bytes, use `Kernel.binary_part/3`
404+
instead.
348405
349406
## Examples
350407
@@ -663,9 +720,7 @@ defmodule String do
663720
"a[,,]b[,,]c"
664721
665722
"""
666-
@spec replace(t, t | Regex.t, t) :: t
667-
@spec replace(t, t | Regex.t, t, Keyword.t) :: t
668-
723+
@spec replace(t, pattern | Regex.t, t, Keyword.t) :: t
669724
def replace(subject, pattern, replacement, options \\ []) when is_binary(replacement) do
670725
if Regex.regex?(pattern) do
671726
Regex.replace(pattern, subject, replacement, global: options[:global])
@@ -872,7 +927,6 @@ defmodule String do
872927
do_chunk(str, pred_fn.(cp), pred_fn)
873928
end
874929

875-
876930
defp do_chunk(str, flag, pred_fn), do: do_chunk(str, [], <<>>, flag, pred_fn)
877931

878932
defp do_chunk(<<>>, acc, <<>>, _, _), do: Enum.reverse(acc)
@@ -1215,19 +1269,13 @@ defmodule String do
12151269
"""
12161270
@spec starts_with?(t, t | [t]) :: boolean
12171271

1218-
def starts_with?(string, prefixes) when is_list(prefixes) do
1219-
Enum.any?(prefixes, &do_starts_with(string, &1))
1220-
end
1221-
1222-
def starts_with?(string, prefix) do
1223-
do_starts_with(string, prefix)
1224-
end
1225-
1226-
defp do_starts_with(string, "") when is_binary(string) do
1272+
def starts_with?(_string, "") do
1273+
IO.puts :stderr, "[deprecation] Calling String.starts_with?/2 with an empty string is deprecated and " <>
1274+
"will fail in the future\n" <> Exception.format_stacktrace()
12271275
true
12281276
end
12291277

1230-
defp do_starts_with(string, prefix) when is_binary(prefix) do
1278+
def starts_with?(string, prefix) when is_list(prefix) or is_binary(prefix) do
12311279
Kernel.match?({0, _}, :binary.match(string, prefix))
12321280
end
12331281

@@ -1249,6 +1297,12 @@ defmodule String do
12491297
"""
12501298
@spec ends_with?(t, t | [t]) :: boolean
12511299

1300+
def ends_with?(_string, "") do
1301+
IO.puts :stderr, "[deprecation] Calling String.ends_with?/2 with an empty string is deprecated and " <>
1302+
"will fail in the future\n" <> Exception.format_stacktrace()
1303+
true
1304+
end
1305+
12521306
def ends_with?(string, suffixes) when is_list(suffixes) do
12531307
Enum.any?(suffixes, &do_ends_with(string, &1))
12541308
end
@@ -1257,10 +1311,6 @@ defmodule String do
12571311
do_ends_with(string, suffix)
12581312
end
12591313

1260-
defp do_ends_with(string, "") when is_binary(string) do
1261-
true
1262-
end
1263-
12641314
defp do_ends_with(string, suffix) when is_binary(suffix) do
12651315
string_size = byte_size(string)
12661316
suffix_size = byte_size(suffix)
@@ -1301,23 +1351,23 @@ defmodule String do
13011351
iex> String.contains? "elixir of life", ["death", "mercury"]
13021352
false
13031353
1304-
"""
1305-
@spec contains?(t, t | [t]) :: boolean
1354+
The argument can also be a precompiled pattern:
13061355
1307-
def contains?(string, contents) when is_list(contents) do
1308-
Enum.any?(contents, &do_contains(string, &1))
1309-
end
1356+
iex> pattern = :binary.compile_pattern(["life", "death"])
1357+
iex> String.contains? "elixir of life", pattern
1358+
true
13101359
1311-
def contains?(string, content) do
1312-
do_contains(string, content)
1313-
end
1360+
"""
1361+
@spec contains?(t, pattern) :: boolean
13141362

1315-
defp do_contains(string, "") when is_binary(string) do
1363+
def contains?(_string, "") do
1364+
IO.puts :stderr, "[deprecation] Calling String.contains?/2 with an empty string is deprecated and " <>
1365+
"will fail in the future\n" <> Exception.format_stacktrace()
13161366
true
13171367
end
13181368

1319-
defp do_contains(string, match) when is_binary(match) do
1320-
:nomatch != :binary.match(string, match)
1369+
def contains?(string, contents) do
1370+
:binary.match(string, contents) != :nomatch
13211371
end
13221372

13231373
@doc """

lib/elixir/test/elixir/string_test.exs

Lines changed: 14 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ defmodule StringTest do
5656
assert String.split("abé", "", trim: true) == ["a", "b", "é"]
5757
assert String.split("abé", "", trim: true, parts: :infinity) == ["a", "b", "é"]
5858
assert String.split("abé", "", trim: true, parts: 2) == ["a", "bé"]
59+
60+
assert String.split("noël", "") == ["n", "o", "ë", "l", ""]
5961
end
6062

6163
test :split_with_regex do
@@ -69,6 +71,18 @@ defmodule StringTest do
6971
assert String.split("a,b", ~r{\.}) == ["a,b"]
7072
end
7173

74+
test :splitter do
75+
assert String.splitter("a,b,c", ",") |> Enum.to_list == ["a", "b", "c"]
76+
assert String.splitter("a,b", ".") |> Enum.to_list == ["a,b"]
77+
assert String.splitter("1,2 3,4", [" ", ","]) |> Enum.to_list == ["1", "2", "3", "4"]
78+
assert String.splitter("", ",") |> Enum.to_list == [""]
79+
80+
assert String.splitter("", ",", trim: true) |> Enum.to_list == []
81+
assert String.splitter(" a b c ", " ", trim: true) |> Enum.to_list == ["a", "b", "c"]
82+
assert String.splitter(" a b c ", " ", trim: true) |> Enum.take(1) == ["a"]
83+
assert String.splitter(" a b c ", " ", trim: true) |> Enum.take(2) == ["a", "b"]
84+
end
85+
7286
test :split_at do
7387
assert String.split_at("", 0) == {"", ""}
7488
assert String.split_at("", -1) == {"", ""}
@@ -419,7 +433,6 @@ defmodule StringTest do
419433
end
420434

421435
test :starts_with? do
422-
## Normal cases ##
423436
assert String.starts_with? "hello", "he"
424437
assert String.starts_with? "hello", "hello"
425438
assert String.starts_with? "hello", ["hellö", "hell"]
@@ -428,25 +441,9 @@ defmodule StringTest do
428441
refute String.starts_with? "hello", "hellö"
429442
refute String.starts_with? "hello", ["hellö", "goodbye"]
430443
refute String.starts_with? "エリクシア", "仙丹"
431-
432-
## Edge cases ##
433-
assert String.starts_with? "", ""
434-
assert String.starts_with? "", ["", "a"]
435-
assert String.starts_with? "b", ["", "a"]
436-
437-
assert String.starts_with? "abc", ""
438-
assert String.starts_with? "abc", [""]
439-
440-
refute String.starts_with? "", "abc"
441-
refute String.starts_with? "", [" "]
442-
443-
## Sanity checks ##
444-
assert String.starts_with? "", ["", ""]
445-
assert String.starts_with? "abc", ["", ""]
446444
end
447445

448446
test :ends_with? do
449-
## Normal cases ##
450447
assert String.ends_with? "hello", "lo"
451448
assert String.ends_with? "hello", "hello"
452449
assert String.ends_with? "hello", ["hell", "lo", "xx"]
@@ -456,43 +453,15 @@ defmodule StringTest do
456453
refute String.ends_with? "hello", "hellö"
457454
refute String.ends_with? "hello", ["hel", "goodbye"]
458455
refute String.ends_with? "エリクシア", "仙丹"
459-
460-
## Edge cases ##
461-
assert String.ends_with? "", ""
462-
assert String.ends_with? "", ["", "a"]
463-
refute String.ends_with? "", ["a", "b"]
464-
465-
assert String.ends_with? "abc", ""
466-
assert String.ends_with? "abc", ["", "x"]
467-
468-
refute String.ends_with? "", "abc"
469-
refute String.ends_with? "", [" "]
470-
471-
## Sanity checks ##
472-
assert String.ends_with? "", ["", ""]
473-
assert String.ends_with? "abc", ["", ""]
474456
end
475457

476458
test :contains? do
477-
## Normal cases ##
478459
assert String.contains? "elixir of life", "of"
479460
assert String.contains? "エリクシア", "シ"
480461
assert String.contains? "elixir of life", ["mercury", "life"]
481462
refute String.contains? "elixir of life", "death"
482463
refute String.contains? "エリクシア", "仙"
483464
refute String.contains? "elixir of life", ["death", "mercury", "eternal life"]
484-
485-
## Edge cases ##
486-
assert String.contains? "", ""
487-
assert String.contains? "abc", ""
488-
assert String.contains? "abc", ["", "x"]
489-
490-
refute String.contains? "", " "
491-
refute String.contains? "", "a"
492-
493-
## Sanity checks ##
494-
assert String.contains? "", ["", ""]
495-
assert String.contains? "abc", ["", ""]
496465
end
497466

498467
test :to_char_list do

0 commit comments

Comments
 (0)