Skip to content

Commit c57a652

Browse files
committed
adds splitBy extension method to scala collections
`Iterator#splitBy` constructs an iterator where consecutive elements of the original iterator are accumulated as long as the output of a key function for each element doesn't change. This operation makes sense when you are processing an iterator whose elements are known to be sorted in a certain way and you need to group them without loading all the data in memory. For instance: * processing a file where the ordering is guaranteed but the file doesn't fit in the heap, * processing a streaming result set where the underlying database guarantees the ordering because of a sort clause. The same operation is added to `Iterable`, with the difference that the specific container type of the input is preserved for both collection levels of the output, thus: * `Set(1,2,3).splitBy(identity)` returns `Set(Set(1), Set(2), Set(3))` * `Vector(1,2,3).splitBy(identity)` returns `Vector(Vector(1), Vector(2), Vector(3))` * etc.
1 parent 1a46759 commit c57a652

File tree

4 files changed

+171
-1
lines changed

4 files changed

+171
-1
lines changed

src/main/scala/scala/collection/decorators/IterableDecorator.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,28 @@ class IterableDecorator[C, I <: IsIterable[C]](coll: C)(implicit val it: I) {
3333
def lazyFoldRight[B](z: B)(op: it.A => Either[B, B => B]): B = {
  // Obtain the elements through the IsIterable evidence, then delegate to the
  // iterator-level lazyFoldRight with the same seed and operator.
  val elements = it(coll).iterator
  elements.lazyFoldRight(z)(op)
}
3535

36+
37+
/**
 * Splits this collection into runs of consecutive elements that share the
 * same key, as computed by `f`. A new run starts whenever the key differs
 * from the key of the previous element.
 * <pre>
 * Vector(1,2,2,3,3,3,2,2)
 *   .splitBy(identity)
 * </pre>
 * yields
 * <pre>
 * Vector(Vector(1),
 *        Vector(2,2),
 *        Vector(3,3,3),
 *        Vector(2,2))
 * </pre>
 *
 * @param f   the function computing the key of an element
 * @param bf  builds each inner collection (`CC1`) from one run of elements
 * @param bff builds the outer collection (`CC2`) from the inner collections
 * @tparam K   the type of the computed key
 * @tparam CC1 the type of the inner collections
 * @tparam CC2 the type of the resulting outer collection
 * @return a collection holding one inner collection per run of consecutive
 *         elements with the same key
 */
def splitBy[K, CC1, CC2](f: it.A => K)(implicit bf: BuildFrom[C, it.A, CC1], bff: BuildFrom[C, CC1, CC2]): CC2 = {
  // The run detection itself is done by the iterator-level splitBy.
  val runs = it(coll).iterator.splitBy(f)
  // Rebuild every run with the builder derived from the source collection...
  val innerCollections = runs.map(run => bf.fromSpecific(coll)(run))
  // ...and gather the rebuilt runs into the outer collection.
  bff.fromSpecific(coll)(innerCollections)
}
3660
}

src/main/scala/scala/collection/decorators/IteratorDecorator.scala

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,68 @@ class IteratorDecorator[A](val `this`: Iterator[A]) extends AnyVal {
7272
loop(immutable.List.empty)
7373
}
7474

75+
/**
 * Constructs an iterator where consecutive elements are accumulated as
 * long as the output of `f` for each element doesn't change.
 * <pre>
 * Vector(1,2,2,3,3,3,2,2)
 *   .iterator
 *   .splitBy(identity)
 *   .toList
 * </pre>
 * produces
 * <pre>
 * List(Seq(1),
 *      Seq(2,2),
 *      Seq(3,3,3),
 *      Seq(2,2))
 * </pre>
 *
 * Keys are compared with `==`. The source iterator is consumed lazily: each
 * call to `next()` reads the elements of one group plus, when another group
 * follows, the first element of that following group. No further look-ahead
 * is requested from the source once that boundary element has been read.
 *
 * @param f the function to compute a key for an element
 * @tparam K the type of the computed key
 * @return an iterator of sequences of the consecutive elements with the
 *         same key in the original iterator
 */
def splitBy[K](f: A => K): Iterator[immutable.Seq[A]] =
  new AbstractIterator[immutable.Seq[A]] {
    // Look-ahead slot: the first element of the upcoming group and its key,
    // already pulled from the source while detecting the end of the previous group.
    private var hd: A = _
    private var hdKey: K = _
    private var hdDefined: Boolean = false

    override def hasNext: Boolean = hdDefined || `this`.hasNext

    override def next(): immutable.Seq[A] =
      if (!hasNext) {
        // Raise the same exception as any exhausted iterator.
        Iterator.empty.next()
      } else {
        if (!hdDefined) {
          // No buffered element: start the group from the source.
          hd = `this`.next()
          hdKey = f(hd)
          hdDefined = true
        }
        val group = Vector.newBuilder[A]
        group += hd

        var boundaryFound = false
        // Test the flag first so that, once the boundary element has been read,
        // the source is not asked for yet another element (which could block on
        // a streaming source).
        while (!boundaryFound && `this`.hasNext) {
          val el = `this`.next()
          val key = f(el)
          if (key == hdKey) {
            group += el
          } else {
            // `el` opens the next group; keep it for the following call.
            boundaryFound = true
            hdKey = key
            hd = el
          }
        }
        // The source ran dry within this group: nothing is buffered any more.
        if (!boundaryFound) {
          hdDefined = false
        }
        group.result()
      }
  }
75139
}

src/test/scala/scala/collection/decorators/IterableDecoratorTest.scala

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ package scala.collection
22
package decorators
33

44
import org.junit.{Assert, Test}
5-
import scala.collection.immutable.{LazyList, List, Range, Map}
5+
6+
import scala.collection.immutable.{LazyList, List, Map, Range}
67

78
class IterableDecoratorTest {
89

@@ -36,4 +37,44 @@ class IterableDecoratorTest {
3637
Assert.assertEquals(2, result2)
3738
}
3839

40+
@Test
def splitByShouldHonorEmptyIterator(): Unit = {
  // Splitting an empty collection must give back an empty collection.
  val noElements = Vector.empty[Int]
  val groups = noElements.splitBy(identity)
  Assert.assertEquals(Vector.empty, groups)
}
45+
46+
@Test
def splitByShouldReturnSingleSeqWhenSingleElement(): Unit = {
  // A single element forms exactly one group containing that element.
  val singleton = Vector("1")
  val groups = singleton.splitBy(identity)
  val expected = Vector(singleton)
  Assert.assertEquals(expected, groups)
}
52+
53+
@Test
def splitByShouldReturnSingleSeqWhenAllElHaveTheSameKey(): Unit = {
  // All elements share one key, so the whole input is a single group.
  val sameKeyElements = Vector("1", "1", "1")
  val groups = sameKeyElements.splitBy(identity)
  val expected = Vector(sameKeyElements)
  Assert.assertEquals(expected, groups)
}
59+
60+
@Test
def splitByShouldReturnVectorOfVectorOrConsecutiveElementsWithTheSameKey(): Unit = {
  // The explicit result type checks that Vector is preserved at both levels.
  val input = Vector("1", "2", "2", "3", "3", "3", "2", "2")
  val groups: Vector[Vector[String]] = input.splitBy(identity)
  val expected =
    Vector(Vector("1"), Vector("2", "2"), Vector("3", "3", "3"), Vector("2", "2"))
  Assert.assertEquals(expected, groups)
}
66+
67+
@Test
def splitByShouldReturnListOfListOfConsecutiveElementsWithTheSameKey(): Unit = {
  // The explicit result type checks that List is preserved at both levels.
  val input = List("1", "2", "2", "3", "3", "3", "2", "2")
  val groups: List[List[String]] = input.splitBy(identity)
  val expected =
    List(List("1"), List("2", "2"), List("3", "3", "3"), List("2", "2"))
  Assert.assertEquals(expected, groups)
}
73+
74+
@Test
def splitByShouldReturnSetOfSetOfConsecutiveElementsWithTheSameKey(): Unit = {
  // The explicit result type checks that Set is preserved at both levels;
  // duplicates in the literal collapse when the input Set is built.
  val input = Set("1", "2", "2", "3", "3", "3", "2", "2")
  val groups: Set[Set[String]] = input.splitBy(identity)
  val expected = Set(Set("1"), Set("2"), Set("3"))
  Assert.assertEquals(expected, groups)
}
3980
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package scala.collection
2+
package decorators
3+
4+
import org.junit.{Assert, Test}
5+
6+
import scala.util.Try
7+
8+
/**
 * Tests for the `splitBy` extension method on `Iterator`.
 *
 * Exhaustion is checked by comparing the string form of the `Try` wrapping
 * `next()` on the tested iterator with that of `Iterator.empty.next()`, i.e.
 * both must fail with the same exception type and message.
 */
class IteratorDecoratorTest {
  @Test
  def splitByShouldHonorEmptyIterator(): Unit = {
    val groupedIterator = Iterator.empty.splitBy(identity)
    Assert.assertFalse(groupedIterator.hasNext)
    // JUnit's argument order is (expected, actual).
    Assert.assertEquals(Try(Iterator.empty.next()).toString, Try(groupedIterator.next()).toString)
  }

  @Test
  def splitByShouldReturnIteratorOfSingleSeqWhenAllElHaveTheSameKey(): Unit = {
    val value = Vector("1", "1", "1")
    val groupedIterator = value.iterator.splitBy(identity)
    Assert.assertTrue(groupedIterator.hasNext)
    Assert.assertEquals(value, groupedIterator.next().toVector)
    Assert.assertFalse(groupedIterator.hasNext)
    Assert.assertEquals(Try(Iterator.empty.next()).toString, Try(groupedIterator.next()).toString)
  }

  @Test
  def splitByShouldReturnIteratorOfSeqOfConsecutiveElementsWithTheSameKey(): Unit = {
    val value = Vector("1", "2", "2", "3", "3", "3", "2", "2")
    val groupedIterator = value.iterator.splitBy(identity)
    Assert.assertTrue(groupedIterator.hasNext)
    Assert.assertEquals(Vector("1"), groupedIterator.next().toVector)
    Assert.assertTrue(groupedIterator.hasNext)
    Assert.assertEquals(Vector("2", "2"), groupedIterator.next().toVector)
    Assert.assertTrue(groupedIterator.hasNext)
    Assert.assertEquals(Vector("3", "3", "3"), groupedIterator.next().toVector)
    Assert.assertTrue(groupedIterator.hasNext)
    Assert.assertEquals(Vector("2", "2"), groupedIterator.next().toVector)
    Assert.assertFalse(groupedIterator.hasNext)
    Assert.assertEquals(Try(Iterator.empty.next()).toString, Try(groupedIterator.next()).toString)
  }
}

0 commit comments

Comments
 (0)