Skip to content

Commit 9846cfe

Browse files
danieldkDaniël de Kok
authored and committed
WriteChunk::write_chunk: use chunk_len
1 parent fdd79e3 commit 9846cfe

File tree

5 files changed

+41
-98
lines changed

5 files changed

+41
-98
lines changed

src/chunks/norms.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,13 @@ impl WriteChunk for NdNorms {
113113
Error::write_error("Cannot get file position for computing padding", e)
114114
})?);
115115

116-
// Chunk size: len (u64), type id (u32), padding ([0,4) bytes), vector.
117-
let chunk_len = size_of::<u64>()
118-
+ size_of::<u32>()
119-
+ n_padding as usize
120-
+ (self.len() * size_of::<f32>());
116+
let remaining_chunk_len =
117+
self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
118+
Error::read_error("Cannot get file position for computing padding", e)
119+
})?) - (size_of::<u32>() + size_of::<u64>()) as u64;
120+
121121
write
122-
.write_u64::<LittleEndian>(chunk_len as u64)
122+
.write_u64::<LittleEndian>(remaining_chunk_len)
123123
.map_err(|e| Error::write_error("Cannot write norms chunk length", e))?;
124124
write
125125
.write_u64::<LittleEndian>(self.len() as u64)

src/chunks/storage/array.rs

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,16 @@ impl NdArray {
221221
let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
222222
Error::write_error("Cannot get file position for computing padding", e)
223223
})?);
224-
// Chunk size: rows (u64), columns (u32), type id (u32),
225-
// padding ([0,4) bytes), matrix.
226-
let chunk_len = size_of::<u64>()
227-
+ size_of::<u32>()
228-
+ size_of::<u32>()
229-
+ n_padding as usize
230-
+ (data.nrows() * data.ncols() * size_of::<f32>());
224+
225+
let remaining_chunk_len = Self::chunk_len(
226+
data.view(),
227+
write.seek(SeekFrom::Current(0)).map_err(|e| {
228+
Error::read_error("Cannot get file position for computing padding", e)
229+
})?,
230+
) - (size_of::<u32>() + size_of::<u64>()) as u64;
231+
231232
write
232-
.write_u64::<LittleEndian>(chunk_len as u64)
233+
.write_u64::<LittleEndian>(remaining_chunk_len)
233234
.map_err(|e| Error::write_error("Cannot write embedding matrix chunk length", e))?;
234235
write
235236
.write_u64::<LittleEndian>(data.nrows() as u64)

src/chunks/storage/quantized.rs

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -167,34 +167,21 @@ impl QuantizedArray {
167167
)
168168
})?;
169169

170-
// projection (u32), use_norms (u32), quantized_len (u32),
171-
// reconstructed_len (u32), n_centroids (u32), rows (u64),
172-
// types (2 x u32 bytes), padding, projection matrix,
173-
// centroids, norms, quantized data.
170+
let remaining_chunk_len = Self::chunk_len_(
171+
quantizer,
172+
quantized.view(),
173+
norms,
174+
write.seek(SeekFrom::Current(0)).map_err(|e| {
175+
Error::read_error("Cannot get file position for computing padding", e)
176+
})?,
177+
) - (size_of::<u32>() + size_of::<u64>()) as u64;
178+
174179
let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
175180
Error::write_error("Cannot get file position for computing padding", e)
176181
})?);
177-
let chunk_size = size_of::<u32>()
178-
+ size_of::<u32>()
179-
+ size_of::<u32>()
180-
+ size_of::<u32>()
181-
+ size_of::<u32>()
182-
+ size_of::<u64>()
183-
+ 2 * size_of::<u32>()
184-
+ n_padding as usize
185-
+ quantizer.projection().is_some() as usize
186-
* quantizer.reconstructed_len()
187-
* quantizer.reconstructed_len()
188-
* size_of::<f32>()
189-
+ quantizer.quantized_len()
190-
* quantizer.n_quantizer_centroids()
191-
* (quantizer.reconstructed_len() / quantizer.quantized_len())
192-
* size_of::<f32>()
193-
+ norms.is_some() as usize * quantized.nrows() * size_of::<f32>()
194-
+ quantized.nrows() * quantizer.quantized_len();
195182

196183
write
197-
.write_u64::<LittleEndian>(chunk_size as u64)
184+
.write_u64::<LittleEndian>(remaining_chunk_len)
198185
.map_err(|e| {
199186
Error::write_error("Cannot write quantized embedding matrix chunk length", e)
200187
})?;

src/chunks/vocab/simple.rs

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::collections::HashMap;
22
use std::convert::TryInto;
3-
use std::io::{Read, Seek, Write};
3+
use std::io::{Read, Seek, SeekFrom, Write};
44
use std::mem::size_of;
55

66
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
@@ -98,20 +98,17 @@ impl WriteChunk for SimpleVocab {
9898
where
9999
W: Write + Seek,
100100
{
101-
// Chunk size: vocabulary size (u64), for each word:
102-
// word length in bytes (4 bytes), word bytes (variable-length).
103-
let chunk_len = size_of::<u64>()
104-
+ self
105-
.words
106-
.iter()
107-
.map(|w| w.len() + size_of::<u32>())
108-
.sum::<usize>();
109-
110101
write
111102
.write_u32::<LittleEndian>(ChunkIdentifier::SimpleVocab as u32)
112103
.map_err(|e| Error::write_error("Cannot write vocabulary chunk identifier", e))?;
104+
105+
let remaining_chunk_len =
106+
self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
107+
Error::read_error("Cannot get file position for computing padding", e)
108+
})?) - (size_of::<u32>() + size_of::<u64>()) as u64;
109+
113110
write
114-
.write_u64::<LittleEndian>(chunk_len as u64)
111+
.write_u64::<LittleEndian>(remaining_chunk_len)
115112
.map_err(|e| Error::write_error("Cannot write vocabulary chunk length", e))?;
116113
write
117114
.write_u64::<LittleEndian>(self.words.len() as u64)

src/chunks/vocab/subword.rs

Lines changed: 8 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -418,27 +418,16 @@ where
418418
where
419419
W: Write + Seek,
420420
{
421-
// Chunk size: vocab size (u64), minimum n-gram length (u32),
422-
// maximum n-gram length (u32), bucket exponent (u32), for
423-
// each word: word length in bytes (u32), word bytes
424-
// (variable-length).
425-
let chunk_len = size_of::<u64>()
426-
+ size_of::<u32>()
427-
+ size_of::<u32>()
428-
+ size_of::<u32>()
429-
+ self
430-
.words()
431-
.iter()
432-
.map(|w| w.len() + size_of::<u32>())
433-
.sum::<usize>();
434-
435421
write
436422
.write_u32::<LittleEndian>(chunk_identifier as u32)
437423
.map_err(|e| {
438424
Error::write_error("Cannot write subword vocabulary chunk identifier", e)
439425
})?;
426+
427+
let remaining_chunk_len = self.chunk_len_() - (size_of::<u32>() + size_of::<u64>()) as u64;
428+
440429
write
441-
.write_u64::<LittleEndian>(chunk_len as u64)
430+
.write_u64::<LittleEndian>(remaining_chunk_len)
442431
.map_err(|e| Error::write_error("Cannot write subword vocabulary chunk length", e))?;
443432
write
444433
.write_u64::<LittleEndian>(self.words.len() as u64)
@@ -563,34 +552,15 @@ impl ExplicitSubwordVocab {
563552
where
564553
W: Write + Seek,
565554
{
566-
// Chunk size: word vocab size (u64), ngram vocab size (u64)
567-
// minimum n-gram length (u32), maximum n-gram length (u32),
568-
// for each word and ngram:
569-
// length in bytes (u32), number of bytes (variable-length).
570-
// each ngram is followed by its index (u64)
571-
let chunk_len = size_of::<u64>()
572-
+ size_of::<u64>()
573-
+ size_of::<u32>()
574-
+ size_of::<u32>()
575-
+ self
576-
.words()
577-
.iter()
578-
.map(|w| w.len() + size_of::<u32>())
579-
.sum::<usize>()
580-
+ self
581-
.indexer
582-
.ngrams()
583-
.iter()
584-
.map(|ngram| ngram.len() + size_of::<u32>() + size_of::<u64>())
585-
.sum::<usize>();
555+
let remaining_chunk_len = self.chunk_len_() - (size_of::<u32>() + size_of::<u64>()) as u64;
586556

587557
write
588558
.write_u32::<LittleEndian>(chunk_identifier as u32)
589559
.map_err(|e| {
590560
Error::write_error("Cannot write subword vocabulary chunk identifier", e)
591561
})?;
592562
write
593-
.write_u64::<LittleEndian>(chunk_len as u64)
563+
.write_u64::<LittleEndian>(remaining_chunk_len)
594564
.map_err(|e| Error::write_error("Cannot write subword vocabulary chunk length", e))?;
595565
write
596566
.write_u64::<LittleEndian>(self.words.len() as u64)
@@ -685,27 +655,15 @@ impl FloretSubwordVocab {
685655
where
686656
W: Write + Seek,
687657
{
688-
// Chunk size: minimum n-gram length (u32), maximum n-gram length (u32),
689-
// number of buckets (u64), number of hashes (u32), hash seed (u32),
690-
// bow and row (variable length).
691-
692-
let chunk_len = size_of::<u32>()
693-
+ size_of::<u32>()
694-
+ size_of::<u64>()
695-
+ size_of::<u32>()
696-
+ size_of::<u32>()
697-
+ self.bow.len()
698-
+ size_of::<u32>()
699-
+ self.eow.len()
700-
+ size_of::<u32>();
658+
let remaining_chunk_len = self.chunk_len_() - (size_of::<u32>() + size_of::<u64>()) as u64;
701659

702660
write
703661
.write_u32::<LittleEndian>(chunk_identifier as u32)
704662
.map_err(|e| {
705663
Error::write_error("Cannot write subword vocabulary chunk identifier", e)
706664
})?;
707665
write
708-
.write_u64::<LittleEndian>(chunk_len as u64)
666+
.write_u64::<LittleEndian>(remaining_chunk_len)
709667
.map_err(|e| Error::write_error("Cannot write subword vocabulary chunk length", e))?;
710668
write
711669
.write_u32::<LittleEndian>(self.min_n)

0 commit comments

Comments (0)