Updated the SparseVectorStrategy class to use the sparse_vector query (#2657)

miguelgrinberg · web-flow · commit 2110df0a1a46 · 2025-01-27T12:32:39.000Z
diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py
@@ -96,7 +96,7 @@ def needs_inference(self) -> bool:
 
 
 class AsyncSparseVectorStrategy(AsyncRetrievalStrategy):
-    """Sparse retrieval strategy using the `text_expansion` processor."""
+    """Sparse retrieval strategy using the `sparse_vector` processor."""
 
     def __init__(self, model_id: str = ".elser_model_2"):
         self.model_id = model_id
@@ -127,11 +127,10 @@ def es_query(
                 "bool": {
                     "must": [
                         {
-                            "text_expansion": {
-                                f"{vector_field}.{self._tokens_field}": {
-                                    "model_id": self.model_id,
-                                    "model_text": query,
-                                }
+                            "sparse_vector": {
+                                "field": f"{vector_field}.{self._tokens_field}",
+                                "inference_id": self.model_id,
+                                "query": query,
                             }
                         }
                     ],
@@ -150,7 +149,7 @@ def es_mappings_settings(
         mappings: Dict[str, Any] = {
             "properties": {
                 vector_field: {
-                    "properties": {self._tokens_field: {"type": "rank_features"}}
+                    "properties": {self._tokens_field: {"type": "sparse_vector"}}
                 }
             }
         }
@@ -172,11 +171,12 @@ async def before_index_creation(
                     {
                         "inference": {
                             "model_id": self.model_id,
-                            "target_field": vector_field,
-                            "field_map": {text_field: "text_field"},
-                            "inference_config": {
-                                "text_expansion": {"results_field": self._tokens_field}
-                            },
+                            "input_output": [
+                                {
+                                    "input_field": text_field,
+                                    "output_field": f"{vector_field}.{self._tokens_field}",
+                                },
+                            ],
                         }
                     }
                 ],
diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py
@@ -96,7 +96,7 @@ def needs_inference(self) -> bool:
 
 
 class SparseVectorStrategy(RetrievalStrategy):
-    """Sparse retrieval strategy using the `text_expansion` processor."""
+    """Sparse retrieval strategy using the `sparse_vector` processor."""
 
     def __init__(self, model_id: str = ".elser_model_2"):
         self.model_id = model_id
@@ -127,11 +127,10 @@ def es_query(
                 "bool": {
                     "must": [
                         {
-                            "text_expansion": {
-                                f"{vector_field}.{self._tokens_field}": {
-                                    "model_id": self.model_id,
-                                    "model_text": query,
-                                }
+                            "sparse_vector": {
+                                "field": f"{vector_field}.{self._tokens_field}",
+                                "inference_id": self.model_id,
+                                "query": query,
                             }
                         }
                     ],
@@ -150,7 +149,7 @@ def es_mappings_settings(
         mappings: Dict[str, Any] = {
             "properties": {
                 vector_field: {
-                    "properties": {self._tokens_field: {"type": "rank_features"}}
+                    "properties": {self._tokens_field: {"type": "sparse_vector"}}
                 }
             }
         }
@@ -172,11 +171,12 @@ def before_index_creation(
                     {
                         "inference": {
                             "model_id": self.model_id,
-                            "target_field": vector_field,
-                            "field_map": {text_field: "text_field"},
-                            "inference_config": {
-                                "text_expansion": {"results_field": self._tokens_field}
-                            },
+                            "input_output": [
+                                {
+                                    "input_field": text_field,
+                                    "output_field": f"{vector_field}.{self._tokens_field}",
+                                },
+                            ],
                         }
                     }
                 ],

Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ def needs_inference(self) -> bool:`
`96`	`96`
`97`	`97`
`98`	`98`	`class AsyncSparseVectorStrategy(AsyncRetrievalStrategy):`
`99`		- """Sparse retrieval strategy using the `text_expansion` processor."""
	`99`	+ """Sparse retrieval strategy using the `sparse_vector` processor."""
`100`	`100`
`101`	`101`	`def __init__(self, model_id: str = ".elser_model_2"):`
`102`	`102`	`self.model_id = model_id`
`@@ -127,11 +127,10 @@ def es_query(`
`127`	`127`	`"bool": {`
`128`	`128`	`"must": [`
`129`	`129`	`{`
`130`		`- "text_expansion": {`
`131`		`- f"{vector_field}.{self._tokens_field}": {`
`132`		`- "model_id": self.model_id,`
`133`		`- "model_text": query,`
`134`		`- }`
	`130`	`+ "sparse_vector": {`
	`131`	`+ "field": f"{vector_field}.{self._tokens_field}",`
	`132`	`+ "inference_id": self.model_id,`
	`133`	`+ "query": query,`
`135`	`134`	`}`
`136`	`135`	`}`
`137`	`136`	`],`
`@@ -150,7 +149,7 @@ def es_mappings_settings(`
`150`	`149`	`mappings: Dict[str, Any] = {`
`151`	`150`	`"properties": {`
`152`	`151`	`vector_field: {`
`153`		`- "properties": {self._tokens_field: {"type": "rank_features"}}`
	`152`	`+ "properties": {self._tokens_field: {"type": "sparse_vector"}}`
`154`	`153`	`}`
`155`	`154`	`}`
`156`	`155`	`}`
`@@ -172,11 +171,12 @@ async def before_index_creation(`
`172`	`171`	`{`
`173`	`172`	`"inference": {`
`174`	`173`	`"model_id": self.model_id,`
`175`		`- "target_field": vector_field,`
`176`		`- "field_map": {text_field: "text_field"},`
`177`		`- "inference_config": {`
`178`		`- "text_expansion": {"results_field": self._tokens_field}`
`179`		`- },`
	`174`	`+ "input_output": [`
	`175`	`+ {`
	`176`	`+ "input_field": text_field,`
	`177`	`+ "output_field": f"{vector_field}.{self._tokens_field}",`
	`178`	`+ },`
	`179`	`+ ],`
`180`	`180`	`}`
`181`	`181`	`}`
`182`	`182`	`],`