Skip to content

Commit

Permalink
Feat/update cli (#1376)
Browse files Browse the repository at this point in the history
* Add update cli option with default storage

* Semver

* Semver

* Pyright

* Format
  • Loading branch information
AlonsoGuevara authored Nov 7, 2024
1 parent baa261c commit 20c1202
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241107010037320137.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add update cli entrypoint for incremental indexing"
}
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"**/.yarn": true,
"**/.pnp.*": true
},
"editor.formatOnSave": false,
"eslint.nodePath": ".yarn/sdks",
"typescript.tsdk": ".yarn/sdks/typescript/lib",
"typescript.enablePromptUseWorkspaceTsdk": true,
Expand Down
67 changes: 66 additions & 1 deletion graphrag/cli/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,76 @@ def index_cli(
output_dir: Path | None,
):
"""Run the pipeline with the given config."""
config = load_config(root_dir, config_filepath)

_run_index(
config=config,
verbose=verbose,
resume=resume,
memprofile=memprofile,
cache=cache,
reporter=reporter,
emit=emit,
dry_run=dry_run,
skip_validation=skip_validation,
output_dir=output_dir,
)


def update_cli(
    root_dir: Path,
    verbose: bool,
    memprofile: bool,
    cache: bool,
    reporter: ReporterType,
    config_filepath: Path | None,
    emit: list[TableEmitterType],
    skip_validation: bool,
    output_dir: Path | None,
):
    """Run the indexing pipeline for the ``update`` command.

    The configuration is loaded from ``root_dir`` (and ``config_filepath``
    when given). If it carries no ``update_index_storage`` section, one is
    filled in from the package defaults before the run is handed to
    ``_run_index``. Resuming and dry-run are always switched off here.
    """
    pipeline_config = load_config(root_dir, config_filepath)

    if not pipeline_config.update_index_storage:
        # No update storage configured: fall back to the default storage
        # type and base directory. Imported lazily so the defaults are only
        # pulled in when they are actually needed.
        from graphrag.config.defaults import STORAGE_TYPE, UPDATE_STORAGE_BASE_DIR
        from graphrag.config.models.storage_config import StorageConfig

        fallback_storage = StorageConfig(
            type=STORAGE_TYPE,
            base_dir=UPDATE_STORAGE_BASE_DIR,
        )
        pipeline_config.update_index_storage = fallback_storage

    _run_index(
        config=pipeline_config,
        verbose=verbose,
        # This entry point never resumes a previous run.
        resume=False,
        memprofile=memprofile,
        cache=cache,
        reporter=reporter,
        emit=emit,
        # ...and never performs a dry run.
        dry_run=False,
        skip_validation=skip_validation,
        output_dir=output_dir,
    )


def _run_index(
config,
verbose,
resume,
memprofile,
cache,
reporter,
emit,
dry_run,
skip_validation,
output_dir,
):
progress_reporter = create_progress_reporter(reporter)
info, error, success = _logger(progress_reporter)
run_id = resume or time.strftime("%Y%m%d-%H%M%S")

config = load_config(root_dir, config_filepath)
config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
config.reporting.base_dir = (
str(output_dir) if output_dir else config.reporting.base_dir
Expand Down
67 changes: 66 additions & 1 deletion graphrag/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

from .index import index_cli
from .index import index_cli, update_cli
from .initialize import initialize_project_at
from .prompt_tune import prompt_tune
from .query import run_drift_search, run_global_search, run_local_search
Expand Down Expand Up @@ -129,6 +129,71 @@ def _index_cli(
)


@app.command("update")
def _update_cli(
    config: Annotated[
        Path | None,
        typer.Option(
            help="The configuration to use.", exists=True, file_okay=True, readable=True
        ),
    ] = None,
    root: Annotated[
        Path,
        typer.Option(
            help="The project root directory.",
            exists=True,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = Path(),  # set default to current directory
    verbose: Annotated[
        bool, typer.Option(help="Run the indexing pipeline with verbose logging")
    ] = False,
    memprofile: Annotated[
        bool, typer.Option(help="Run the indexing pipeline with memory profiling")
    ] = False,
    reporter: Annotated[
        ReporterType, typer.Option(help="The progress reporter to use.")
    ] = ReporterType.RICH,
    emit: Annotated[
        str, typer.Option(help="The data formats to emit, comma-separated.")
    ] = TableEmitterType.Parquet.value,
    cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True,
    skip_validation: Annotated[
        bool,
        typer.Option(
            help="Skip any preflight validation. Useful when running no LLM steps."
        ),
    ] = False,
    output: Annotated[
        Path | None,
        typer.Option(
            help="Indexing pipeline output directory. Overrides storage.base_dir in the configuration file.",
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = None,
):
    """
    Update an existing knowledge graph index.
    Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the `update_output` folder.
    """
    # The emit option arrives as one comma-separated string; split it and
    # convert each trimmed entry into its TableEmitterType member.
    emitter_types = [
        TableEmitterType(raw_format.strip()) for raw_format in emit.split(",")
    ]
    reporter_type = ReporterType(reporter)

    # Map the CLI option names onto the parameter names update_cli expects.
    update_cli(
        root_dir=root,
        verbose=verbose,
        memprofile=memprofile,
        cache=cache,
        reporter=reporter_type,
        config_filepath=config,
        emit=emitter_types,
        skip_validation=skip_validation,
        output_dir=output,
    )


@app.command("prompt-tune")
def _prompt_tune_cli(
root: Annotated[
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ test_smoke = "pytest ./tests/smoke"
test_notebook = "pytest ./tests/notebook"
test_verbs = "pytest ./tests/verbs"
index = "python -m graphrag index"
update = "python -m graphrag update"
init = "python -m graphrag init"
query = "python -m graphrag query"
prompt_tune = "python -m graphrag prompt-tune"
Expand Down

0 comments on commit 20c1202

Please sign in to comment.