Add index creation to aggregations example (elastic#1862)

miguelgrinberg · web-flow · commit 8a110b06d304 · 2024-07-10T11:05:48.000+01:00
* Add index creation to aggregations example

* remove shard/replica restrictions from test datasets
diff --git a/examples/async/composite_agg.py b/examples/async/composite_agg.py
@@ -19,7 +19,10 @@
 import os
 from typing import Any, AsyncIterator, Dict, List, Optional, Union
 
+from elasticsearch.helpers import async_bulk
+
 from elasticsearch_dsl import A, Agg, AsyncSearch, Response, async_connections
+from tests.test_integration.test_data import DATA, GIT_INDEX
 
 
 async def scan_aggs(
@@ -56,8 +59,17 @@ async def run_search(**kwargs: Any) -> Response:
 
 async def main() -> None:
     # initiate the default connection to elasticsearch
-    async_connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])
+    client = async_connections.create_connection(
+        hosts=[os.environ["ELASTICSEARCH_URL"]]
+    )
+
+    # create the index and populate it with some data
+    # note that the dataset is imported from the library's test suite
+    await client.indices.delete(index="git", ignore_unavailable=True)
+    await client.indices.create(index="git", **GIT_INDEX)
+    await async_bulk(client, DATA, raise_on_error=True, refresh=True)
 
+    # run some aggregations on the data
     async for b in scan_aggs(
         AsyncSearch(index="git"),
         {"files": A("terms", field="files")},
diff --git a/examples/composite_agg.py b/examples/composite_agg.py
@@ -18,7 +18,10 @@
 import os
 from typing import Any, Dict, Iterator, List, Optional, Union
 
+from elasticsearch.helpers import bulk
+
 from elasticsearch_dsl import A, Agg, Response, Search, connections
+from tests.test_integration.test_data import DATA, GIT_INDEX
 
 
 def scan_aggs(
@@ -55,8 +58,15 @@ def run_search(**kwargs: Any) -> Response:
 
 def main() -> None:
     # initiate the default connection to elasticsearch
-    connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])
+    client = connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])
+
+    # create the index and populate it with some data
+    # note that the dataset is imported from the library's test suite
+    client.indices.delete(index="git", ignore_unavailable=True)
+    client.indices.create(index="git", **GIT_INDEX)
+    bulk(client, DATA, raise_on_error=True, refresh=True)
 
+    # run some aggregations on the data
     for b in scan_aggs(
         Search(index="git"),
         {"files": A("terms", field="files")},
diff --git a/tests/test_integration/test_data.py b/tests/test_integration/test_data.py
@@ -19,99 +19,85 @@
 
 from elasticsearch import Elasticsearch
 
+user_mapping = {
+    "properties": {"name": {"type": "text", "fields": {"raw": {"type": "keyword"}}}}
+}
 
-def create_flat_git_index(client: Elasticsearch, index: str) -> None:
-    # we will use user on several places
-    user_mapping = {
-        "properties": {"name": {"type": "text", "fields": {"raw": {"type": "keyword"}}}}
-    }
-
-    client.indices.create(
-        index=index,
-        body={
-            "settings": {
-                # just one shard, no replicas for testing
-                "number_of_shards": 1,
-                "number_of_replicas": 0,
-                # custom analyzer for analyzing file paths
-                "analysis": {
-                    "analyzer": {
-                        "file_path": {
-                            "type": "custom",
-                            "tokenizer": "path_hierarchy",
-                            "filter": ["lowercase"],
-                        }
-                    }
-                },
-            },
-            "mappings": {
-                "properties": {
-                    "description": {"type": "text", "analyzer": "snowball"},
-                    "author": user_mapping,
-                    "authored_date": {"type": "date"},
-                    "committer": user_mapping,
-                    "committed_date": {"type": "date"},
-                    "parent_shas": {"type": "keyword"},
-                    "files": {
-                        "type": "text",
-                        "analyzer": "file_path",
-                        "fielddata": True,
-                    },
+FLAT_GIT_INDEX: Dict[str, Any] = {
+    "settings": {
+        # custom analyzer for analyzing file paths
+        "analysis": {
+            "analyzer": {
+                "file_path": {
+                    "type": "custom",
+                    "tokenizer": "path_hierarchy",
+                    "filter": ["lowercase"],
                 }
+            }
+        },
+    },
+    "mappings": {
+        "properties": {
+            "description": {"type": "text", "analyzer": "snowball"},
+            "author": user_mapping,
+            "authored_date": {"type": "date"},
+            "committer": user_mapping,
+            "committed_date": {"type": "date"},
+            "parent_shas": {"type": "keyword"},
+            "files": {
+                "type": "text",
+                "analyzer": "file_path",
+                "fielddata": True,
             },
+        }
+    },
+}
+
+GIT_INDEX: Dict[str, Any] = {
+    "settings": {
+        # custom analyzer for analyzing file paths
+        "analysis": {
+            "analyzer": {
+                "file_path": {
+                    "type": "custom",
+                    "tokenizer": "path_hierarchy",
+                    "filter": ["lowercase"],
+                }
+            }
         },
-    )
+    },
+    "mappings": {
+        "properties": {
+            # common fields
+            "description": {"type": "text", "analyzer": "snowball"},
+            "commit_repo": {"type": "join", "relations": {"repo": "commit"}},
+            # COMMIT mappings
+            "author": user_mapping,
+            "authored_date": {"type": "date"},
+            "committer": user_mapping,
+            "committed_date": {"type": "date"},
+            "parent_shas": {"type": "keyword"},
+            "files": {
+                "type": "text",
+                "analyzer": "file_path",
+                "fielddata": True,
+            },
+            # REPO mappings
+            "is_public": {"type": "boolean"},
+            "owner": user_mapping,
+            "created_at": {"type": "date"},
+            "tags": {"type": "keyword"},
+        }
+    },
+}
 
 
-def create_git_index(client: Elasticsearch, index: str) -> None:
-    # we will use user on several places
-    user_mapping = {
-        "properties": {"name": {"type": "text", "fields": {"raw": {"type": "keyword"}}}}
-    }
+def create_flat_git_index(client: Elasticsearch, index: str) -> None:
+    client.indices.create(index=index, body=FLAT_GIT_INDEX)
 
-    client.indices.create(
-        index=index,
-        body={
-            "settings": {
-                # just one shard, no replicas for testing
-                "number_of_shards": 1,
-                "number_of_replicas": 0,
-                # custom analyzer for analyzing file paths
-                "analysis": {
-                    "analyzer": {
-                        "file_path": {
-                            "type": "custom",
-                            "tokenizer": "path_hierarchy",
-                            "filter": ["lowercase"],
-                        }
-                    }
-                },
-            },
-            "mappings": {
-                "properties": {
-                    # common fields
-                    "description": {"type": "text", "analyzer": "snowball"},
-                    "commit_repo": {"type": "join", "relations": {"repo": "commit"}},
-                    # COMMIT mappings
-                    "author": user_mapping,
-                    "authored_date": {"type": "date"},
-                    "committer": user_mapping,
-                    "committed_date": {"type": "date"},
-                    "parent_shas": {"type": "keyword"},
-                    "files": {
-                        "type": "text",
-                        "analyzer": "file_path",
-                        "fielddata": True,
-                    },
-                    # REPO mappings
-                    "is_public": {"type": "boolean"},
-                    "owner": user_mapping,
-                    "created_at": {"type": "date"},
-                    "tags": {"type": "keyword"},
-                }
-            },
-        },
-    )
+
+def create_git_index(client: Elasticsearch, index: str) -> None:
+    client.indices.create(index=index, body=GIT_INDEX)
 
 
 DATA = [
diff --git a/utils/run-unasync.py b/utils/run-unasync.py
@@ -64,6 +64,7 @@ def main(check=False):
         "async_connections": "connections",
         "async_scan": "scan",
         "async_simulate": "simulate",
+        "async_bulk": "bulk",
         "async_mock_client": "mock_client",
         "async_client": "client",
         "async_data_client": "data_client",