Skip to content

Commit cd6ad25

Browse files
author
takashi hashida
committed
Don't use LINQ
Avoid unnecessary allocation. Change RecordBatchManipulator to RecordBatchEnumerator.
1 parent 9b6dd27 commit cd6ad25

File tree

3 files changed

+121
-98
lines changed

3 files changed

+121
-98
lines changed

csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs

Lines changed: 63 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
using System.Collections.Generic;
2020
using System.Diagnostics;
2121
using System.IO;
22-
using System.Linq;
2322
using System.Threading;
2423
using System.Threading.Tasks;
2524
using Apache.Arrow.Types;
@@ -115,38 +114,43 @@ private List<IArrowArray> BuildArrays(
115114
ByteBuffer messageBuffer,
116115
Flatbuf.RecordBatch recordBatchMessage)
117116
{
118-
return CreateInner().ToList();
117+
var arrays = new List<IArrowArray>(recordBatchMessage.NodesLength);
119118

120-
IEnumerable<IArrowArray> CreateInner()
119+
if (recordBatchMessage.NodesLength == 0)
121120
{
122-
var recordBatchManipulator = new RecordBatchManipulator(in recordBatchMessage);
121+
return arrays;
122+
}
123+
124+
var recordBatchEnumerator = new RecordBatchEnumerator(in recordBatchMessage);
123125

124-
while (!recordBatchManipulator.IsAllNodeRead)
125-
{
126-
var field = schema.GetFieldByIndex(recordBatchManipulator.CurrentNodeIndex);
127-
Flatbuf.FieldNode fieldNode = recordBatchManipulator.ShiftNode();
126+
do
127+
{
128+
var field = schema.GetFieldByIndex(recordBatchEnumerator.CurrentNodeIndex);
129+
var fieldNode = recordBatchEnumerator.CurrentNode;
128130

129-
var arrayData = field.DataType.IsFixedPrimitive() ?
130-
LoadPrimitiveField(recordBatchManipulator, field, in fieldNode, messageBuffer) :
131-
LoadVariableField(recordBatchManipulator, field, in fieldNode, messageBuffer);
131+
var arrayData = field.DataType.IsFixedPrimitive()
132+
? LoadPrimitiveField(recordBatchEnumerator, field, in fieldNode, messageBuffer)
133+
: LoadVariableField(recordBatchEnumerator, field, in fieldNode, messageBuffer);
132134

133-
yield return ArrowArrayFactory.BuildArray(arrayData);
134-
}
135-
}
135+
arrays.Add(ArrowArrayFactory.BuildArray(arrayData));
136+
} while (recordBatchEnumerator.MoveNextNode());
137+
138+
return arrays;
136139
}
137140

138141

139142
private ArrayData LoadPrimitiveField(
140-
RecordBatchManipulator recordBatchManipulator,
143+
RecordBatchEnumerator recordBatchEnumerator,
141144
Field field,
142145
in Flatbuf.FieldNode fieldNode,
143146
ByteBuffer bodyData)
144147
{
145-
var nullBitmapBuffer = recordBatchManipulator.ShiftBuffer();
146-
var valueBuffer = recordBatchManipulator.ShiftBuffer();
147148

148-
ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, nullBitmapBuffer);
149-
ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, valueBuffer);
149+
ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
150+
recordBatchEnumerator.MoveNextBuffer();
151+
ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
152+
recordBatchEnumerator.MoveNextBuffer();
153+
150154

151155
var fieldLength = (int)fieldNode.Length;
152156
var fieldNullCount = (int)fieldNode.NullCount;
@@ -162,25 +166,25 @@ private ArrayData LoadPrimitiveField(
162166
}
163167

164168
var arrowBuff = new[] { nullArrowBuffer, valueArrowBuffer };
165-
var offspring = GetOffspring(recordBatchManipulator, field, bodyData);
169+
var children = GetChildren(recordBatchEnumerator, field, bodyData);
166170

167-
return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, offspring.ToArray());
171+
return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children);
168172
}
169173

170174

171175
private ArrayData LoadVariableField(
172-
RecordBatchManipulator recordBatchManipulator,
176+
RecordBatchEnumerator recordBatchEnumerator,
173177
Field field,
174178
in Flatbuf.FieldNode fieldNode,
175179
ByteBuffer bodyData)
176180
{
177-
var nullBitmapBuffer = recordBatchManipulator.ShiftBuffer();
178-
var offsetBuffer = recordBatchManipulator.ShiftBuffer();
179-
var valueBuffer = recordBatchManipulator.ShiftBuffer();
180181

181-
ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, nullBitmapBuffer);
182-
ArrowBuffer offsetArrowBuffer = BuildArrowBuffer(bodyData, offsetBuffer);
183-
ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, valueBuffer);
182+
ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
183+
recordBatchEnumerator.MoveNextBuffer();
184+
ArrowBuffer offsetArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
185+
recordBatchEnumerator.MoveNextBuffer();
186+
ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
187+
recordBatchEnumerator.MoveNextBuffer();
184188

185189
var fieldLength = (int)fieldNode.Length;
186190
var fieldNullCount = (int)fieldNode.NullCount;
@@ -196,24 +200,33 @@ private ArrayData LoadVariableField(
196200
}
197201

198202
var arrowBuff = new[] { nullArrowBuffer, offsetArrowBuffer, valueArrowBuffer };
199-
var offspring = GetOffspring(recordBatchManipulator, field, bodyData);
203+
var children = GetChildren(recordBatchEnumerator, field, bodyData);
200204

201-
return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, offspring.ToArray());
205+
return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children);
202206
}
203207

204-
private IEnumerable<ArrayData> GetOffspring(
205-
RecordBatchManipulator recordBatchManipulator,
208+
private ArrayData[] GetChildren(
209+
RecordBatchEnumerator recordBatchEnumerator,
206210
Field field,
207211
ByteBuffer bodyData)
208212
{
209-
if (!(field.DataType is NestedType type)) yield break;
210-
foreach (var childField in type.Children)
213+
if (!(field.DataType is NestedType type)) return null;
214+
215+
var childrenCount = type.ChildrenCount;
216+
var children = new ArrayData[childrenCount];
217+
for (var index = 0; index < childrenCount; index++)
211218
{
212-
Flatbuf.FieldNode childFieldNode = recordBatchManipulator.ShiftNode();
213-
yield return childField.DataType.IsFixedPrimitive()
214-
? LoadPrimitiveField(recordBatchManipulator, childField, in childFieldNode, bodyData)
215-
: LoadVariableField(recordBatchManipulator, childField, in childFieldNode, bodyData);
219+
Flatbuf.FieldNode childFieldNode = recordBatchEnumerator.CurrentNode;
220+
recordBatchEnumerator.MoveNextNode();
221+
222+
var childField = type.Children[index];
223+
var child = childField.DataType.IsFixedPrimitive()
224+
? LoadPrimitiveField(recordBatchEnumerator, childField, in childFieldNode, bodyData)
225+
: LoadVariableField(recordBatchEnumerator, childField, in childFieldNode, bodyData);
226+
227+
children[index] = child;
216228
}
229+
return children;
217230
}
218231

219232
private ArrowBuffer BuildArrowBuffer(ByteBuffer bodyData, Flatbuf.Buffer buffer)
@@ -231,27 +244,29 @@ private ArrowBuffer BuildArrowBuffer(ByteBuffer bodyData, Flatbuf.Buffer buffer)
231244
}
232245
}
233246

234-
class RecordBatchManipulator
247+
internal class RecordBatchEnumerator
235248
{
236-
private int CurrentBufferIndex { get; set; }
237249
private Flatbuf.RecordBatch RecordBatch { get; }
250+
internal int CurrentBufferIndex { get; set; }
238251
internal int CurrentNodeIndex { get; set; }
239-
internal bool IsAllNodeRead => CurrentNodeIndex >= RecordBatch.NodesLength;
240252

241-
internal RecordBatchManipulator(in Flatbuf.RecordBatch recordBatch)
253+
internal Flatbuf.Buffer CurrentBuffer => RecordBatch.Buffers(CurrentBufferIndex).GetValueOrDefault();
254+
255+
internal Flatbuf.FieldNode CurrentNode => RecordBatch.Nodes(CurrentNodeIndex).GetValueOrDefault();
256+
257+
internal bool MoveNextBuffer()
242258
{
243-
RecordBatch = recordBatch;
259+
return ++CurrentBufferIndex < RecordBatch.BuffersLength;
244260
}
245261

246-
internal Flatbuf.Buffer ShiftBuffer()
262+
internal bool MoveNextNode()
247263
{
248-
return RecordBatch.Buffers(CurrentBufferIndex++).GetValueOrDefault();
264+
return ++CurrentNodeIndex < RecordBatch.NodesLength;
249265
}
250266

251-
internal Flatbuf.FieldNode ShiftNode()
267+
internal RecordBatchEnumerator(in Flatbuf.RecordBatch recordBatch)
252268
{
253-
return RecordBatch.Nodes(CurrentNodeIndex++).GetValueOrDefault();
269+
RecordBatch = recordBatch;
254270
}
255-
256271
}
257272
}

csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs

Lines changed: 55 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
using System.Buffers.Binary;
1919
using System.Collections.Generic;
2020
using System.IO;
21-
using System.Linq;
2221
using System.Threading;
2322
using System.Threading.Tasks;
2423
using Apache.Arrow.Types;
@@ -177,47 +176,40 @@ public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOp
177176
_options = options ?? IpcOptions.Default;
178177
}
179178

180-
private VectorOffset CreateFieldVector(IEnumerable<IArrowArray> fieldArrayList)
181-
{
182-
var allArrowArrayList = GetAll(fieldArrayList).ToList();
183-
184-
Flatbuf.RecordBatch.StartNodesVector(Builder, allArrowArrayList.Count);
185179

186-
foreach (var array in allArrowArrayList)
180+
private void CreateSelfAndChildrenFieldNodes(ArrayData data)
181+
{
182+
if (data.DataType is NestedType)
187183
{
188-
Flatbuf.FieldNode.CreateFieldNode(Builder, array.Length, array.NullCount);
184+
// flatbuffer struct vectors have to be created in reverse order
185+
for (var i = data.Children.Length - 1; i >= 0; i--)
186+
{
187+
CreateSelfAndChildrenFieldNodes(data.Children[i]);
188+
}
189189
}
190+
Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.NullCount);
191+
}
190192

191-
return Builder.EndVector();
192-
193-
194-
//Inner methods
195-
196-
IEnumerable<IArrowArray> GetAll(IEnumerable<IArrowArray> targetArrayList)
193+
private int CountAllNodes()
194+
{
195+
var count = 0;
196+
foreach (var arrowArray in Schema.Fields.Values)
197197
{
198-
foreach (var arrowArray in targetArrayList)
199-
{
200-
foreach (var arr in GetSelfAndOffspring(arrowArray))
201-
{
202-
yield return arr;
203-
}
204-
}
198+
CountSelfAndChildrenNodes(arrowArray.DataType, ref count);
205199
}
200+
return count;
201+
}
206202

207-
IEnumerable<IArrowArray> GetSelfAndOffspring(IArrowArray targetArray)
203+
private void CountSelfAndChildrenNodes(IArrowType type, ref int count)
204+
{
205+
if (type is NestedType nestedType)
208206
{
209-
if (targetArray.Data.DataType is NestedType)
207+
foreach (var childField in nestedType.Children)
210208
{
211-
foreach (var child in targetArray.Data.Children)
212-
{
213-
foreach (var offspring in GetSelfAndOffspring(ArrowArrayFactory.BuildArray(child)))
214-
{
215-
yield return offspring;
216-
}
217-
}
209+
CountSelfAndChildrenNodes(childField.DataType, ref count);
218210
}
219-
yield return targetArray;
220211
}
212+
count++;
221213
}
222214

223215
private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBatch,
@@ -234,13 +226,22 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat
234226
Builder.Clear();
235227

236228
// Serialize field nodes
229+
230+
var fieldCount = Schema.Fields.Count;
231+
232+
Flatbuf.RecordBatch.StartNodesVector(Builder, CountAllNodes());
233+
237234
// flatbuffer struct vectors have to be created in reverse order
238-
var fieldNodesVectorOffset = CreateFieldVector(recordBatch.Arrays.Reverse());
235+
for (var i = fieldCount - 1; i >= 0; i--)
236+
{
237+
CreateSelfAndChildrenFieldNodes(recordBatch.Column(i).Data);
238+
}
239+
240+
var fieldNodesVectorOffset = Builder.EndVector();
239241

240242
// Serialize buffers
241-
var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();
242243

243-
var fieldCount = Schema.Fields.Count;
244+
var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();
244245
for (var i = 0; i < fieldCount; i++)
245246
{
246247
var fieldArray = recordBatch.Column(i);
@@ -349,12 +350,11 @@ private ValueTask WriteBufferAsync(ArrowBuffer arrowBuffer, CancellationToken ca
349350
var fieldNameOffset = Builder.CreateString(field.Name);
350351
var fieldType = _fieldTypeBuilder.BuildFieldType(field);
351352

352-
var fieldChildren = GetChildrenFieldOffset(field).ToArray();
353-
var fieldChildrenOffsets = Builder.CreateVectorOfTables(fieldChildren.ToArray());
353+
var fieldChildrenVectorOffset = Builder.CreateVectorOfTables(GetChildrenFieldOffsets(field));
354354

355355
fieldOffsets[i] = Flatbuf.Field.CreateField(Builder,
356356
fieldNameOffset, field.IsNullable, fieldType.Type, fieldType.Offset,
357-
default, fieldChildrenOffsets, default);
357+
default, fieldChildrenVectorOffset, default);
358358
}
359359

360360
var fieldsVectorOffset = Flatbuf.Schema.CreateFieldsVector(Builder, fieldOffsets);
@@ -367,19 +367,28 @@ private ValueTask WriteBufferAsync(ArrowBuffer arrowBuffer, CancellationToken ca
367367
Builder, endianness, fieldsVectorOffset);
368368
}
369369

370-
private protected IEnumerable<Offset<Flatbuf.Field>> GetChildrenFieldOffset(Field field)
370+
private protected Offset<Flatbuf.Field>[] GetChildrenFieldOffsets(Field field)
371371
{
372-
if (!(field.DataType is NestedType type)) yield break;
373-
foreach (var child in type.Children)
372+
if (!(field.DataType is NestedType type))
374373
{
375-
var fieldNameOffset = Builder.CreateString(child.Name);
376-
var fieldType = _fieldTypeBuilder.BuildFieldType(child);
377-
var fieldChildrenOffsets = Builder.CreateVectorOfTables(GetChildrenFieldOffset(child).ToArray());
374+
return new Offset<Flatbuf.Field>[0];
375+
}
376+
377+
var childrenCount = type.ChildrenCount;
378+
var children = new Offset<Flatbuf.Field>[childrenCount];
378379

379-
yield return Flatbuf.Field.CreateField(Builder,
380-
fieldNameOffset, child.IsNullable, fieldType.Type, fieldType.Offset,
381-
default, fieldChildrenOffsets, default);
380+
for (var i = 0; i < childrenCount; i++)
381+
{
382+
var childField = type.Children[i];
383+
var childFieldNameOffset = Builder.CreateString(childField.Name);
384+
var childFieldType = _fieldTypeBuilder.BuildFieldType(childField);
385+
var childFieldChildrenVectorOffset = Builder.CreateVectorOfTables(GetChildrenFieldOffsets(childField));
386+
387+
children[i] = Flatbuf.Field.CreateField(Builder,
388+
childFieldNameOffset, childField.IsNullable, childFieldType.Type, childFieldType.Offset,
389+
default, childFieldChildrenVectorOffset, default);
382390
}
391+
return children;
383392
}
384393

385394
private async ValueTask<Offset<Flatbuf.Schema>> WriteSchemaAsync(Schema schema, CancellationToken cancellationToken)

csharp/test/Apache.Arrow.Tests/TestData.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
using Apache.Arrow.Types;
1717
using System;
1818
using System.Collections.Generic;
19-
using System.Linq;
2019

2120
namespace Apache.Arrow.Tests
2221
{
@@ -86,7 +85,7 @@ private static IArrowArray CreateArray(Field field, int length)
8685
field.DataType.Accept(creator);
8786

8887
ArrayData data = new ArrayData(field.DataType, length, 0, 0,
89-
new[] { ArrowBuffer.Empty, creator.Buffer }, creator.Children?.Select(_ => _.Data));
88+
new[] { ArrowBuffer.Empty, creator.Buffer }, creator.Children);
9089

9190
return ArrowArrayFactory.BuildArray(data);
9291
}
@@ -110,7 +109,7 @@ private class ArrayBufferCreator :
110109
private readonly int _length;
111110
public ArrowBuffer Buffer { get; private set; }
112111

113-
public List<IArrowArray> Children { get; private set; }
112+
public ArrayData[] Children { get; private set; }
114113

115114
public ArrayBufferCreator(int length)
116115
{
@@ -174,7 +173,7 @@ public void Visit(UInt16Type type)
174173
public void Visit(ListType type)
175174
{
176175
//Buffer is valueOffsetsBuffer
177-
Children = new List<IArrowArray> { CreateArray(type.ValueField, _length) };
176+
Children = new[] { CreateArray(type.ValueField, _length).Data };
178177
ArrowBuffer.Builder<int> builder = new ArrowBuffer.Builder<int>(_length);
179178
for (int i = 0; i < _length; i++)
180179
builder.Append(i);

0 commit comments

Comments
 (0)