Allow specifying the compression level when training a dictionary: compression improves when the level used for training is close to the level used in the compression step. (Fully backward compatible for the public Java API.)

Morten Grouleff · luben · commit 3ca26eed6c84 · 2024-09-22T14:06:50.000+01:00
diff --git a/src/main/java/com/github/luben/zstd/Zstd.java b/src/main/java/com/github/luben/zstd/Zstd.java
@@ -862,12 +862,26 @@ public static long getDirectByteBufferFrameContentSize(ByteBuffer src, int srcPo
      *          it fails (which can be tested using ZSTD_isError())
      */
     public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy) {
+        return trainFromBuffer(samples, dictBuffer, legacy, defaultCompressionLevel());
+    }
+
+    /**
+     * Creates a new dictionary to tune a kind of samples
+     *
+     * @param samples the samples buffer array
+     * @param dictBuffer the new dictionary buffer
+     * @param legacy  use the legacy training algorithm; otherwise cover
+     * @param compressionLevel  the compression level to optimize the dictionary for; results are best when it matches the level later used for compression
+     * @return the number of bytes into buffer 'dictBuffer' or an error code if
+     *          it fails (which can be tested using ZSTD_isError())
+     */
+    public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel) {
         if (samples.length <= 10) {
             throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
         }
-        return trainFromBuffer0(samples, dictBuffer, legacy);
+        return trainFromBuffer0(samples, dictBuffer, legacy, compressionLevel);
     }
-    private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy);
+    private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel);
 
     /**
      * Creates a new dictionary to tune a kind of samples
@@ -880,12 +894,29 @@ public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean
      *          it fails (which can be tested using ZSTD_isError())
      */
     public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy) {
+	return trainFromBufferDirect(samples, sampleSizes, dictBuffer, legacy, defaultCompressionLevel());
+    }
+
+    /**
+     * Creates a new dictionary to tune a kind of samples
+     *
+     * @param samples the samples direct byte buffer array
+     * @param sampleSizes java integer array of sizes
+     * @param dictBuffer the new dictionary buffer (preallocated direct byte buffer)
+     * @param legacy  use the legacy training algorithm; otherwise cover
+     * @param compressionLevel  the compression level to optimize the dictionary for; results are best when it matches the level later used for compression
+     * @return the number of bytes into buffer 'dictBuffer' or an error code if
+     *          it fails (which can be tested using ZSTD_isError())
+     */
+    public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel) {
         if (sampleSizes.length <= 10) {
             throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
         }
-        return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy);
+        return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy, compressionLevel);
     }
-    private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy);
+
+
+    private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel);
 
     /**
      * Get DictId from a compressed frame
diff --git a/src/main/native/dictBuilder/zdict.c b/src/main/native/dictBuilder/zdict.c
@@ -1105,15 +1105,18 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
 
 
 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, int compressionLevel)
 {
     ZDICT_fastCover_params_t params;
     DEBUGLOG(3, "ZDICT_trainFromBuffer");
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
-    /* Use default level since no compression level information is available */
-    params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+    if (compressionLevel <= 0) {
+        params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+    } else {
+        params.zParams.compressionLevel = compressionLevel;
+    }
 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
     params.zParams.notificationLevel = DEBUGLEVEL;
 #endif
diff --git a/src/main/native/jni_zdict.c b/src/main/native/jni_zdict.c
@@ -8,7 +8,7 @@
 #include <string.h>
 
 JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
-  (JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy) {
+  (JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy, jint compressionLevel) {
     size_t size = 0;
     jsize num_samples = (*env)->GetArrayLength(env, samples);
     size_t *samples_sizes = malloc(sizeof(size_t) * num_samples);
@@ -45,9 +45,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
     if (legacy == JNI_TRUE) {
         ZDICT_legacy_params_t params;
         memset(&params,0,sizeof(params));
+        params.zParams.compressionLevel = compressionLevel;
         size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
     } else {
-        size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
+        size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
     }
     (*env)->ReleasePrimitiveArrayCritical(env, dictBuffer, dict_buff, 0);
     free(samples_buffer);
@@ -56,7 +57,7 @@ E1: return size;
 }
 
 JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
-  (JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy) {
+  (JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy, jint compressionLevel) {
 
     size_t size = 0;
     void *samples_buffer = (*env)->GetDirectBufferAddress(env, samples);
@@ -81,9 +82,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
     if (legacy == JNI_TRUE) {
         ZDICT_legacy_params_t params;
         memset(&params, 0, sizeof(params));
+        params.zParams.compressionLevel = compressionLevel;
         size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
     } else {
-        size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
+        size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
     }
 E2: free(samples_sizes);
 E1: return size;
diff --git a/src/main/native/zdict.h b/src/main/native/zdict.h
@@ -209,7 +209,8 @@ extern "C" {
  */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                                     const void* samplesBuffer,
-                                    const size_t* samplesSizes, unsigned nbSamples);
+                                    const size_t* samplesSizes, unsigned nbSamples,
+                                    int compressionLevel);
 
 typedef struct {
     int      compressionLevel;   /**< optimize for a specific zstd compression level; 0 means default */