Skip to content

Commit

Permalink
Merge pull request #39 from russellb/chunking-tests
Browse files Browse the repository at this point in the history
Move unit tests from instructlab repo
  • Loading branch information
russellb authored Jun 25, 2024
2 parents d395b77 + 3216f2d commit b81f3e3
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 1 deletion.
19 changes: 19 additions & 0 deletions .github/actions/free-disk-space/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Composite action that reclaims disk space on an Ubuntu-based runner by
# deleting Docker images, large preinstalled SDK directories, and unneeded
# apt packages. `df -h` is printed before and after so the log shows the gain.
name: 'Free Disk Space'
description: 'Frees disk space on the runner'
runs:
  using: "composite"
  steps:
    - run: |
        df -h
        # Pass each image ID as its own argument. Quoting the whole command
        # substitution would hand `docker rmi` a single newline-joined string,
        # which fails (silently, because of `|| true`) when more than one
        # image exists. `xargs -r` skips the call when there are no images.
        docker image ls -aq | xargs -r sudo docker rmi >/dev/null 2>&1 || true
        # Best-effort removal of large toolchains preinstalled on the runner.
        sudo rm -rf \
          /usr/share/dotnet /usr/local/lib/android /opt/ghc \
          /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup \
          /usr/lib/jvm || true
        # aptitude is installed only for its pattern-based purge syntax.
        sudo apt install aptitude -y >/dev/null 2>&1
        sudo aptitude purge '~n ^mysql' -f -y >/dev/null 2>&1
        sudo aptitude purge '~n ^dotnet' -f -y >/dev/null 2>&1
        sudo apt-get autoremove -y >/dev/null 2>&1
        sudo apt-get autoclean -y >/dev/null 2>&1
        df -h
      shell: bash
117 changes: 117 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0

# Runs the unit-test suite through tox on every supported Python/platform
# combination, for pushes and pull requests that touch Python sources,
# dependency manifests, tox configuration, or this workflow itself.
name: Test

on:
  workflow_dispatch:
  push:
    branches:
      - "main"
      - "release-**"
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'requirements*.txt'
      - 'tox.ini'
      - '.github/workflows/test.yml' # This workflow
  pull_request:
    branches:
      - "main"
      - "release-**"
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'requirements*.txt'
      - 'tox.ini'
      - '.github/workflows/test.yml' # This workflow

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  test:
    name: "${{ matrix.python }} on ${{ matrix.platform }}"
    runs-on: "${{ matrix.platform }}"
    strategy:
      matrix:
        # Versions are quoted so that "3.10" is not parsed as the float 3.1.
        python:
          - "3.10"
          - "3.11"
        platform:
          - "ubuntu-latest"
        # macOS is exercised with a single Python version only.
        include:
          - python: "3.11"
            platform: "macos-latest"
    steps:
      - name: "Harden Runner"
        uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1
        with:
          egress-policy: audit # TODO: change to 'egress-policy: block' after a couple of runs

      - name: Checkout
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Free disk space
        if: matrix.platform != 'macos-latest'
        uses: ./.github/actions/free-disk-space

      - name: Install the expect package
        if: startsWith(matrix.platform, 'ubuntu')
        run: |
          sudo apt-get install -y expect

      - name: Install tools on MacOS
        if: startsWith(matrix.platform, 'macos')
        run: |
          brew install expect coreutils bash

      - name: Setup Python ${{ matrix.python }}
        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
        with:
          python-version: ${{ matrix.python }}
          cache: pip
          cache-dependency-path: |
            **/pyproject.toml
            **/requirements*.txt

      - name: Remove llama-cpp-python from cache
        run: |
          pip cache remove llama_cpp_python

      - name: Cache huggingface
        uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
        with:
          path: ~/.cache/huggingface
          # config contains DEFAULT_MODEL
          # NOTE(review): this path looks inherited from the instructlab repo;
          # confirm it exists here. hashFiles() on a path with no match returns
          # an empty string, which would make this cache key constant.
          key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # The requirement must be quoted: unquoted, bash parses `>=1.2` as a
          # redirection to a file named "=1.2" and the version pin is dropped.
          python -m pip install tox "tox-gh>=1.2"

      - name: Run unit tests with tox
        run: |
          tox

      # Repeated after the test run (even on failure) so the wheel is not
      # left in the pip cache.
      - name: Remove llama-cpp-python from cache
        if: always()
        run: |
          pip cache remove llama_cpp_python

  # Single job that depends on the whole `test` matrix.
  test-workflow-complete:
    needs: ["test"]
    runs-on: ubuntu-latest
    steps:
      - name: Test Workflow Complete
        run: echo "Test Workflow Complete"
4 changes: 4 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,8 @@ instructlab>=0.17.0
pre-commit>=3.0.4,<4.0
pylint>=2.16.2,<4.0
pylint-pydantic
pytest
pytest-asyncio
pytest-cov
pytest-html
tox>=4.4.2,<5
Empty file added tests/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from unittest.mock import Mock, patch

# Third Party
import git
import pytest
import yaml

# First Party
from instructlab.sdg import utils

# Local
from .testdata import testdata


class TestUtils:
    """Unit tests for the document-chunking helpers in ``instructlab.sdg.utils``."""

    def test_chunk_docs_wc_exeeds_ctx_window(self):
        """``chunk_document`` raises ``ValueError`` for 1000 words against a 1034 context."""
        expected_message = "Given word count (1000) per doc will exceed the server context window size (1034)"
        with pytest.raises(ValueError) as exc:
            utils.chunk_document(
                documents=testdata.documents,
                chunk_word_count=1000,
                server_ctx_size=1034,
            )
        reported = str(exc.value)
        assert expected_message in reported

    def test_chunk_docs_chunk_overlap_error(self):
        """``chunk_document`` raises ``ValueError`` when the overlap exceeds the chunk size."""
        expected_message = "Got a larger chunk overlap (100) than chunk size (24), should be smaller"
        with pytest.raises(ValueError) as exc:
            utils.chunk_document(
                documents=testdata.documents,
                chunk_word_count=5,
                server_ctx_size=1034,
            )
        reported = str(exc.value)
        assert expected_message in reported

    def test_chunk_docs_long_lines(self):
        """Every chunk of a newline-free document stays within the character budget."""
        chunk_words = 50
        chunks = utils.chunk_document(
            documents=testdata.long_line_documents,
            chunk_word_count=chunk_words,
            server_ctx_size=4096,
        )
        token_budget = utils.num_tokens_from_words(chunk_words)
        # Budget = characters for the token count, plus the chunk overlap,
        # plus 50 characters of slack for some really long words.
        char_limit = (
            utils.num_chars_from_tokens(token_budget)
            + utils.DEFAULT_CHUNK_OVERLAP
            + 50
        )
        oversized = [chunk for chunk in chunks if len(chunk) > char_limit]
        assert not oversized
25 changes: 25 additions & 0 deletions tests/testdata/testdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0

# Single-document fixture: one multi-line passage whose lines are separated by
# embedded newlines. The tests in this commit pass it to utils.chunk_document.
documents = [
"""Knowledge is an awareness of facts, a familiarity with individuals and situations,
or a practical skill. Knowledge of facts, also called propositional knowledge, is often characterized
as true belief that is distinct from opinion or guesswork by virtue of justification.
While there is wide agreement among philosophers that propositional knowledge is a form of true belief,
many controversies focus on justification. This includes questions like how to understand justification,
whether it is needed at all, and whether something else besides it is needed.
These controversies intensified in the latter half of the 20th century due to a series of thought experiments
called Gettier cases that provoked alternative definitions."""
]

# Single-document fixture holding the passage as one long line: the fragments
# below are joined by implicit string concatenation, so the result contains no
# newline characters. Every fragment except the last must end with a space so
# that adjacent words are not fused together at the joins.
long_line_documents = [
    (
        "Knowledge is an awareness of facts, a familiarity with individuals and situations, "
        "or a practical skill. Knowledge of facts, also called propositional knowledge, is often characterized "
        "as true belief that is distinct from opinion or guesswork by virtue of justification. "
        "While there is wide agreement among philosophers that propositional knowledge is a form of true belief, "
        "many controversies focus on justification. This includes questions like how to understand justification, "
        "whether it is needed at all, and whether something else besides it is needed. "
        "These controversies intensified in the latter half of the 20th century due to a series of thought experiments "
        "called Gettier cases that provoked alternative definitions."
    )
]
20 changes: 19 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,22 @@
[tox]
# py3-unit runs unit tests with 'python3'
# py311-unit runs the same tests with 'python3.11'
envlist = ruff, lint, mypy, spellcheck
envlist = ruff, lint, mypy, spellcheck, py3-unit
minversion = 4.4

# Base test environment shared by the unit-test environments named in this
# file (e.g. py3-unit, py311-unitcov).
[testenv]
description = run tests (unit, unitcov)
# Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
# are huge. This reduces venv from 5.7 GB to 1.5 GB.
setenv =
PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
package = wheel
wheel_build_env = pkg
deps = -r requirements-dev.txt
# Each command below is prefixed with a tox factor: the "unit:" line runs only
# in environments whose name contains "unit", the "unitcov:" line only in those
# containing "unitcov". Both default to the "tests" directory via posargs.
commands =
unit: {envpython} -m pytest {posargs:tests}
unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"}

# format, check, and linting targets don't build and install the project to
# speed up testing.
[testenv:lint]
Expand Down Expand Up @@ -59,3 +72,8 @@ deps =
pytest
commands =
mypy src

# Maps a Python version to the tox environment run for it.
# NOTE(review): presumably consumed by the tox-gh plugin that the CI workflow
# installs, keyed on the runner's Python version; confirm against tox-gh docs.
[gh]
python =
3.11 = py311-unitcov
3.10 = py310-unitcov

0 comments on commit b81f3e3

Please sign in to comment.