Merge pull request microsoft#14 from KylinMountain/cutomize
Merge official repo: 0.3.0
KylinMountain authored Aug 13, 2024
2 parents 9109de3 + f565b58 commit 44be86f
Showing 22 changed files with 778 additions and 573 deletions.
30 changes: 30 additions & 0 deletions .semversioner/0.3.0.json
@@ -0,0 +1,30 @@
{
"changes": [
{
"description": "Implement auto templating API.",
"type": "minor"
},
{
"description": "Implement query engine API.",
"type": "minor"
},
{
"description": "Fix file dumps using json for non ASCII chars",
"type": "patch"
},
{
"description": "Stabilize smoke tests for query context building",
"type": "patch"
},
{
"description": "fix query embedding",
"type": "patch"
},
{
"description": "fix sort_context & max_tokens params in verb",
"type": "patch"
}
],
"created_at": "2024-08-12T23:51:49+00:00",
"version": "0.3.0"
}
11 changes: 10 additions & 1 deletion CHANGELOG.md
@@ -1,12 +1,21 @@
# Changelog

Note: version releases in the 0.x.y range may introduce breaking changes.

## 0.3.0

- minor: Implement auto templating API.
- minor: Implement query engine API.
- patch: Fix file dumps using json for non ASCII chars
- patch: Stabilize smoke tests for query context building
- patch: fix query embedding
- patch: fix sort_context & max_tokens params in verb

## 0.2.2

- patch: Add a check if there is no community record added in local search context
- patch: Add sepparate workflow for Python Tests
- patch: Docs updates
- patch: Run smoke tests on 4o

## 0.2.1

3 changes: 1 addition & 2 deletions CODEOWNERS
@@ -2,5 +2,4 @@
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
* @microsoft/societal-resilience
* @microsoft/graphrag-core-team
* @microsoft/societal-resilience @microsoft/graphrag-core-team
@@ -144,7 +144,7 @@ def _get_context_string(
new_context_string = _get_context_string(
sorted_nodes, sorted_edges, sorted_claims, sub_community_reports
)
if num_tokens(context_string) > max_tokens:
if num_tokens(new_context_string) > max_tokens:
break
context_string = new_context_string

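The one-line change above makes the trimming loop check the token budget against the candidate string before accepting it. A minimal sketch of the corrected pattern, assuming a tiktoken-based stand-in for the repository's num_tokens helper (the helper and all names here are illustrative, not the actual sort_context module):

import tiktoken

def num_tokens(text: str) -> int:
    # Stand-in for the repository's token counter (assumed tiktoken-compatible).
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

def build_trimmed_context(chunks: list[str], max_tokens: int) -> str:
    # Grow the context until the next candidate string would exceed the budget.
    context_string = ""
    for chunk in chunks:
        new_context_string = context_string + chunk
        # The 0.3.0 fix: measure the candidate (new_context_string), not the
        # already-accepted context, so the returned string never exceeds max_tokens.
        if num_tokens(new_context_string) > max_tokens:
            break
        context_string = new_context_string
    return context_string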
2 changes: 1 addition & 1 deletion graphrag/index/graph/extractors/summarize/prompts.py
@@ -8,7 +8,7 @@
Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
Make sure it is written in third person, and include the entity names so we the have full context.
Make sure it is written in third person, and include the entity names so we have the full context.
#######
-Data-
5 changes: 5 additions & 0 deletions graphrag/index/workflows/v1/create_final_community_reports.py
@@ -19,6 +19,10 @@ def build_steps(
"""
covariates_enabled = config.get("covariates_enabled", False)
create_community_reports_config = config.get("create_community_reports", {})
community_report_strategy = create_community_reports_config.get("strategy", {})
community_report_max_input_length = community_report_strategy.get(
"max_input_length", 16_000
)
base_text_embed = config.get("text_embed", {})
community_report_full_content_embed_config = config.get(
"community_report_full_content_embed", base_text_embed
@@ -77,6 +81,7 @@
{
"id": "local_contexts",
"verb": "prepare_community_reports",
"args": {"max_tokens": community_report_max_input_length},
"input": {
"source": "nodes",
"nodes": "nodes",
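The lines added above thread a configurable max_input_length from the workflow config into the prepare_community_reports verb instead of a hard-coded limit. A small self-contained sketch of that resolution logic, using the same keys and 16_000 fallback shown in the diff (the override value is illustrative):

config = {
    "create_community_reports": {
        "strategy": {"max_input_length": 12_000},  # illustrative override
    },
}

create_community_reports_config = config.get("create_community_reports", {})
community_report_strategy = create_community_reports_config.get("strategy", {})
community_report_max_input_length = community_report_strategy.get(
    "max_input_length", 16_000
)

step = {
    "id": "local_contexts",
    "verb": "prepare_community_reports",
    "args": {"max_tokens": community_report_max_input_length},
}
print(step["args"])  # {'max_tokens': 12000}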
82 changes: 39 additions & 43 deletions graphrag/prompt_tune/__main__.py
@@ -1,53 +1,48 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""The Prompt auto templating package root."""
"""The auto templating package root."""

import argparse
import asyncio
from enum import Enum

from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

from .api import DocSelectionType
from .cli import prompt_tune


class DocSelectionType(Enum):
"""The type of document selection to use."""

ALL = "all"
RANDOM = "random"
TOP = "top"
AUTO = "auto"

def __str__(self):
"""Return the string representation of the enum value."""
return self.value

from .generator import MAX_TOKEN_COUNT
from .loader import MIN_CHUNK_SIZE

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(
prog="python -m graphrag.prompt_tune",
description="The graphrag auto templating module.",
)

parser.add_argument(
"--config",
help="Configuration yaml file to use when generating prompts",
required=True,
type=str,
)

parser.add_argument(
"--root",
help="The data project root. Including the config yml, json or .env",
help="Data project root. Default: current directory",
required=False,
type=str,
default=".",
)

parser.add_argument(
"--domain",
help="The domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. If left empty, the domain will be inferred from the input data.",
help="Domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. If not defined, the domain will be inferred from the input data.",
required=False,
default="",
type=str,
)

parser.add_argument(
"--method",
help="The method to select documents, one of: all, random, top or auto",
"--selection-method",
help=f"Chunk selection method. Default: {DocSelectionType.RANDOM}",
required=False,
type=DocSelectionType,
choices=list(DocSelectionType),
@@ -56,47 +51,47 @@ def __str__(self):

parser.add_argument(
"--n_subset_max",
help="The number of text chunks to embed when using auto selection method",
help="Number of text chunks to embed when using auto selection method. Default: 300",
required=False,
type=int,
default=300,
)

parser.add_argument(
"--k",
help="The maximum number of documents to select from each centroid when using auto selection method",
help="Maximum number of documents to select from each centroid when using auto selection method. Default: 15",
required=False,
type=int,
default=15,
)

parser.add_argument(
"--limit",
help="The limit of files to load when doing random or top selection",
help="Number of documents to load when doing random or top selection. Default: 15",
type=int,
required=False,
default=15,
)

parser.add_argument(
"--max-tokens",
help="Max token count for prompt generation",
help=f"Max token count for prompt generation. Default: {MAX_TOKEN_COUNT}",
type=int,
required=False,
default=MAX_TOKEN_COUNT,
)

parser.add_argument(
"--min-examples-required",
help="The minimum number of examples required in entity extraction prompt",
help="Minimum number of examples required in the entity extraction prompt. Default: 2",
type=int,
required=False,
default=2,
)

parser.add_argument(
"--chunk-size",
help="Max token count for prompt generation",
help=f"Max token count for prompt generation. Default: {MIN_CHUNK_SIZE}",
type=int,
required=False,
default=MIN_CHUNK_SIZE,
@@ -120,7 +115,7 @@ def __str__(self):

parser.add_argument(
"--output",
help="Folder to save the generated prompts to",
help="Directory to save generated prompts to. Default: 'prompts'",
type=str,
required=False,
default="prompts",
@@ -132,17 +127,18 @@ def __str__(self):

loop.run_until_complete(
prompt_tune(
args.root,
args.domain,
str(args.method),
args.limit,
args.max_tokens,
args.chunk_size,
args.language,
args.no_entity_types,
args.output,
args.n_subset_max,
args.k,
args.min_examples_required,
config=args.config,
root=args.root,
domain=args.domain,
selection_method=args.selection_method,
limit=args.limit,
max_tokens=args.max_tokens,
chunk_size=args.chunk_size,
language=args.language,
skip_entity_types=args.no_entity_types,
output=args.output,
n_subset_max=args.n_subset_max,
k=args.k,
min_examples_required=args.min_examples_required,
)
)
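Taken together, the refactored entry point reduces to an awaitable prompt_tune call whose keyword arguments mirror the CLI flags defined above. A hedged sketch of invoking it programmatically (the settings path and override values are illustrative, and the --language default is assumed since its definition is outside this hunk):

import asyncio

from graphrag.prompt_tune.api import DocSelectionType
from graphrag.prompt_tune.cli import prompt_tune
from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

asyncio.run(
    prompt_tune(
        config="settings.yaml",                    # illustrative path; --config is required
        root=".",
        domain="",                                 # empty: infer the domain from the input data
        selection_method=DocSelectionType.RANDOM,  # --selection-method default
        limit=15,
        max_tokens=MAX_TOKEN_COUNT,
        chunk_size=MIN_CHUNK_SIZE,
        language=None,                             # assumed default for --language
        skip_entity_types=False,                   # maps to --no_entity_types
        output="prompts",
        n_subset_max=300,
        k=15,
        min_examples_required=2,
    )
)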
