Skip to content

Commit 3ca26ee

Browse files
Morten Grouleff, luben
Morten Grouleff
authored and committed
Allow specifying a compression level when training a dictionary: compression improves when the level used for training is close to the level used in the compression step. (100% backward compatible with the public API.)
1 parent eea07fc commit 3ca26ee

File tree

4 files changed

+49
-12
lines changed

4 files changed

+49
-12
lines changed

src/main/java/com/github/luben/zstd/Zstd.java

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -862,12 +862,26 @@ public static long getDirectByteBufferFrameContentSize(ByteBuffer src, int srcPo
862862
* it fails (which can be tested using ZSTD_isError())
863863
*/
864864
public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy) {
865+
return trainFromBuffer(samples, dictBuffer, legacy, defaultCompressionLevel());
866+
}
867+
868+
/**
869+
* Creates a new dictionary to tune a kind of samples
870+
*
871+
* @param samples the samples buffer array
872+
* @param dictBuffer the new dictionary buffer
873+
* @param legacy use the legacy training algorithm; otherwise cover
874+
* @param compressionLevel the compression level to optimize the dictionary for; results are best when it matches the level later used for compression
875+
* @return the number of bytes into buffer 'dictBuffer' or an error code if
876+
* it fails (which can be tested using ZSTD_isError())
877+
*/
878+
public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel) {
865879
if (samples.length <= 10) {
866880
throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
867881
}
868-
return trainFromBuffer0(samples, dictBuffer, legacy);
882+
return trainFromBuffer0(samples, dictBuffer, legacy, compressionLevel);
869883
}
870-
private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy);
884+
private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel);
871885

872886
/**
873887
* Creates a new dictionary to tune a kind of samples
@@ -880,12 +894,29 @@ public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean
880894
* it fails (which can be tested using ZSTD_isError())
881895
*/
882896
public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy) {
897+
return trainFromBufferDirect(samples, sampleSizes, dictBuffer, legacy, defaultCompressionLevel());
898+
}
899+
900+
/**
901+
* Creates a new dictionary to tune a kind of samples
902+
*
903+
* @param samples the samples direct byte buffer array
904+
* @param sampleSizes java integer array of sizes
905+
* @param dictBuffer the new dictionary buffer (preallocated direct byte buffer)
906+
* @param legacy use the legacy training algorithm; otherwise cover
907+
* @param compressionLevel the compression level to optimize the dictionary for; results are best when it matches the level later used for compression
908+
* @return the number of bytes into buffer 'dictBuffer' or an error code if
909+
* it fails (which can be tested using ZSTD_isError())
910+
*/
911+
public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel) {
883912
if (sampleSizes.length <= 10) {
884913
throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
885914
}
886-
return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy);
915+
return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy, compressionLevel);
887916
}
888-
private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy);
917+
918+
919+
private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel);
889920

890921
/**
891922
* Get DictId from a compressed frame

src/main/native/dictBuilder/zdict.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,15 +1105,18 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
11051105

11061106

11071107
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1108-
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1108+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, int compressionLevel)
11091109
{
11101110
ZDICT_fastCover_params_t params;
11111111
DEBUGLOG(3, "ZDICT_trainFromBuffer");
11121112
memset(&params, 0, sizeof(params));
11131113
params.d = 8;
11141114
params.steps = 4;
1115-
/* Use default level since no compression level information is available */
1116-
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1115+
if (compressionLevel <= 0) {
1116+
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1117+
} else {
1118+
params.zParams.compressionLevel = compressionLevel;
1119+
}
11171120
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
11181121
params.zParams.notificationLevel = DEBUGLEVEL;
11191122
#endif

src/main/native/jni_zdict.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include <string.h>
99

1010
JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
11-
(JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy) {
11+
(JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy, jint compressionLevel) {
1212
size_t size = 0;
1313
jsize num_samples = (*env)->GetArrayLength(env, samples);
1414
size_t *samples_sizes = malloc(sizeof(size_t) * num_samples);
@@ -45,9 +45,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
4545
if (legacy == JNI_TRUE) {
4646
ZDICT_legacy_params_t params;
4747
memset(&params,0,sizeof(params));
48+
params.zParams.compressionLevel = compressionLevel;
4849
size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
4950
} else {
50-
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
51+
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
5152
}
5253
(*env)->ReleasePrimitiveArrayCritical(env, dictBuffer, dict_buff, 0);
5354
free(samples_buffer);
@@ -56,7 +57,7 @@ E1: return size;
5657
}
5758

5859
JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
59-
(JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy) {
60+
(JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy, jint compressionLevel) {
6061

6162
size_t size = 0;
6263
void *samples_buffer = (*env)->GetDirectBufferAddress(env, samples);
@@ -81,9 +82,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
8182
if (legacy == JNI_TRUE) {
8283
ZDICT_legacy_params_t params;
8384
memset(&params, 0, sizeof(params));
85+
params.zParams.compressionLevel = compressionLevel;
8486
size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
8587
} else {
86-
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
88+
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
8789
}
8890
E2: free(samples_sizes);
8991
E1: return size;

src/main/native/zdict.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,8 @@ extern "C" {
209209
*/
210210
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
211211
const void* samplesBuffer,
212-
const size_t* samplesSizes, unsigned nbSamples);
212+
const size_t* samplesSizes, unsigned nbSamples,
213+
int compressionLevel);
213214

214215
typedef struct {
215216
int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */

0 commit comments

Comments
 (0)