Merge pull request microsoft#14 from KylinMountain/cutomize
Merge official repo: 0.3.0
KylinMountain authored Aug 13, 2024
2 parents 9109de3 + f565b58 commit 44be86f
Showing 22 changed files with 778 additions and 573 deletions.
30 changes: 30 additions & 0 deletions .semversioner/0.3.0.json
@@ -0,0 +1,30 @@
{
"changes": [
{
"description": "Implement auto templating API.",
"type": "minor"
},
{
"description": "Implement query engine API.",
"type": "minor"
},
{
"description": "Fix file dumps using json for non ASCII chars",
"type": "patch"
},
{
"description": "Stabilize smoke tests for query context building",
"type": "patch"
},
{
"description": "fix query embedding",
"type": "patch"
},
{
"description": "fix sort_context & max_tokens params in verb",
"type": "patch"
}
],
"created_at": "2024-08-12T23:51:49+00:00",
"version": "0.3.0"
}
11 changes: 10 additions & 1 deletion CHANGELOG.md
@@ -1,12 +1,21 @@
# Changelog

Note: version releases in the 0.x.y range may introduce breaking changes.

## 0.3.0

- minor: Implement auto templating API.
- minor: Implement query engine API.
- patch: Fix file dumps using json for non ASCII chars
- patch: Stabilize smoke tests for query context building
- patch: fix query embedding
- patch: fix sort_context & max_tokens params in verb

## 0.2.2

- patch: Add a check if there is no community record added in local search context
- patch: Add sepparate workflow for Python Tests
- patch: Docs updates
- patch: Run smoke tests on 4o

## 0.2.1

3 changes: 1 addition & 2 deletions CODEOWNERS
@@ -2,5 +2,4 @@
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
* @microsoft/societal-resilience
* @microsoft/graphrag-core-team
* @microsoft/societal-resilience @microsoft/graphrag-core-team
@@ -144,7 +144,7 @@ def _get_context_string(
new_context_string = _get_context_string(
sorted_nodes, sorted_edges, sorted_claims, sub_community_reports
)
if num_tokens(context_string) > max_tokens:
if num_tokens(new_context_string) > max_tokens:
break
context_string = new_context_string

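The one-line change above makes the trimming loop check the token budget against the candidate string before accepting it. A minimal sketch of the corrected pattern, assuming a tiktoken-based stand-in for the repository's num_tokens helper (the helper and all names here are illustrative, not the actual sort_context module):

import tiktoken

def num_tokens(text: str) -> int:
    # Stand-in for the repository's token counter (assumed tiktoken-compatible).
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

def build_trimmed_context(chunks: list[str], max_tokens: int) -> str:
    # Grow the context until the next candidate string would exceed the budget.
    context_string = ""
    for chunk in chunks:
        new_context_string = context_string + chunk
        # The 0.3.0 fix: measure the candidate (new_context_string), not the
        # already-accepted context, so the returned string never exceeds max_tokens.
        if num_tokens(new_context_string) > max_tokens:
            break
        context_string = new_context_string
    return context_string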
2 changes: 1 addition & 1 deletion graphrag/index/graph/extractors/summarize/prompts.py
@@ -8,7 +8,7 @@
Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
Make sure it is written in third person, and include the entity names so we the have full context.
Make sure it is written in third person, and include the entity names so we have the full context.
#######
-Data-
5 changes: 5 additions & 0 deletions graphrag/index/workflows/v1/create_final_community_reports.py
@@ -19,6 +19,10 @@ def build_steps(
"""
covariates_enabled = config.get("covariates_enabled", False)
create_community_reports_config = config.get("create_community_reports", {})
community_report_strategy = create_community_reports_config.get("strategy", {})
community_report_max_input_length = community_report_strategy.get(
"max_input_length", 16_000
)
base_text_embed = config.get("text_embed", {})
community_report_full_content_embed_config = config.get(
"community_report_full_content_embed", base_text_embed
@@ -77,6 +81,7 @@
{
"id": "local_contexts",
"verb": "prepare_community_reports",
"args": {"max_tokens": community_report_max_input_length},
"input": {
"source": "nodes",
"nodes": "nodes",
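The lines added above thread a configurable max_input_length from the workflow config into the prepare_community_reports verb instead of a hard-coded limit. A small self-contained sketch of that resolution logic, using the same keys and 16_000 fallback shown in the diff (the override value is illustrative):

config = {
    "create_community_reports": {
        "strategy": {"max_input_length": 12_000},  # illustrative override
    },
}

create_community_reports_config = config.get("create_community_reports", {})
community_report_strategy = create_community_reports_config.get("strategy", {})
community_report_max_input_length = community_report_strategy.get(
    "max_input_length", 16_000
)

step = {
    "id": "local_contexts",
    "verb": "prepare_community_reports",
    "args": {"max_tokens": community_report_max_input_length},
}
print(step["args"])  # {'max_tokens': 12000}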
82 changes: 39 additions & 43 deletions graphrag/prompt_tune/__main__.py
@@ -1,53 +1,48 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""The Prompt auto templating package root."""
"""The auto templating package root."""

import argparse
import asyncio
from enum import Enum

from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

from .api import DocSelectionType
from .cli import prompt_tune


class DocSelectionType(Enum):
"""The type of document selection to use."""

ALL = "all"
RANDOM = "random"
TOP = "top"
AUTO = "auto"

def __str__(self):
"""Return the string representation of the enum value."""
return self.value

from .generator import MAX_TOKEN_COUNT
from .loader import MIN_CHUNK_SIZE

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(
prog="python -m graphrag.prompt_tune",
description="The graphrag auto templating module.",
)

parser.add_argument(
"--config",
help="Configuration yaml file to use when generating prompts",
required=True,
type=str,
)

parser.add_argument(
"--root",
help="The data project root. Including the config yml, json or .env",
help="Data project root. Default: current directory",
required=False,
type=str,
default=".",
)

parser.add_argument(
"--domain",
help="The domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. If left empty, the domain will be inferred from the input data.",
help="Domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. If not defined, the domain will be inferred from the input data.",
required=False,
default="",
type=str,
)

parser.add_argument(
"--method",
help="The method to select documents, one of: all, random, top or auto",
"--selection-method",
help=f"Chunk selection method. Default: {DocSelectionType.RANDOM}",
required=False,
type=DocSelectionType,
choices=list(DocSelectionType),
@@ -56,47 +51,47 @@ def __str__(self):

parser.add_argument(
"--n_subset_max",
help="The number of text chunks to embed when using auto selection method",
help="Number of text chunks to embed when using auto selection method. Default: 300",
required=False,
type=int,
default=300,
)

parser.add_argument(
"--k",
help="The maximum number of documents to select from each centroid when using auto selection method",
help="Maximum number of documents to select from each centroid when using auto selection method. Default: 15",
required=False,
type=int,
default=15,
)

parser.add_argument(
"--limit",
help="The limit of files to load when doing random or top selection",
help="Number of documents to load when doing random or top selection. Default: 15",
type=int,
required=False,
default=15,
)

parser.add_argument(
"--max-tokens",
help="Max token count for prompt generation",
help=f"Max token count for prompt generation. Default: {MAX_TOKEN_COUNT}",
type=int,
required=False,
default=MAX_TOKEN_COUNT,
)

parser.add_argument(
"--min-examples-required",
help="The minimum number of examples required in entity extraction prompt",
help="Minimum number of examples required in the entity extraction prompt. Default: 2",
type=int,
required=False,
default=2,
)

parser.add_argument(
"--chunk-size",
help="Max token count for prompt generation",
help=f"Max token count for prompt generation. Default: {MIN_CHUNK_SIZE}",
type=int,
required=False,
default=MIN_CHUNK_SIZE,
@@ -120,7 +115,7 @@ def __str__(self):

parser.add_argument(
"--output",
help="Folder to save the generated prompts to",
help="Directory to save generated prompts to. Default: 'prompts'",
type=str,
required=False,
default="prompts",
@@ -132,17 +127,18 @@ def __str__(self):

loop.run_until_complete(
prompt_tune(
args.root,
args.domain,
str(args.method),
args.limit,
args.max_tokens,
args.chunk_size,
args.language,
args.no_entity_types,
args.output,
args.n_subset_max,
args.k,
args.min_examples_required,
config=args.config,
root=args.root,
domain=args.domain,
selection_method=args.selection_method,
limit=args.limit,
max_tokens=args.max_tokens,
chunk_size=args.chunk_size,
language=args.language,
skip_entity_types=args.no_entity_types,
output=args.output,
n_subset_max=args.n_subset_max,
k=args.k,
min_examples_required=args.min_examples_required,
)
)
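Taken together, the refactored entry point reduces to an awaitable prompt_tune call whose keyword arguments mirror the CLI flags defined above. A hedged sketch of invoking it programmatically (the settings path and override values are illustrative, and the --language default is assumed since its definition is outside this hunk):

import asyncio

from graphrag.prompt_tune.api import DocSelectionType
from graphrag.prompt_tune.cli import prompt_tune
from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

asyncio.run(
    prompt_tune(
        config="settings.yaml",                    # illustrative path; --config is required
        root=".",
        domain="",                                 # empty: infer the domain from the input data
        selection_method=DocSelectionType.RANDOM,  # --selection-method default
        limit=15,
        max_tokens=MAX_TOKEN_COUNT,
        chunk_size=MIN_CHUNK_SIZE,
        language=None,                             # assumed default for --language
        skip_entity_types=False,                   # maps to --no_entity_types
        output="prompts",
        n_subset_max=300,
        k=15,
        min_examples_required=2,
    )
)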
