
Commit
Merge pull request #12 from SomaLogic/master
Release of 0.4.2
kyoung73 authored Nov 7, 2023
2 parents 3894833 + beea766 commit a7072ec
Showing 26 changed files with 1,302 additions and 979 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/python-app.yml
@@ -0,0 +1,35 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

on:
pull_request:
branches: [ "master", "release" ]

permissions:
contents: read

jobs:
build:

runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
python-version: ["3.8", "3.12"]

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
pip install .
- name: Test with pytest
run: |
pytest
28 changes: 28 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,28 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
args:
- --unsafe
- id: check-json
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black", "--filter-files", "--skip=__init__.py"]
- repo: https://github.com/ambv/black
rev: 23.10.1
hooks:
- id: black
args: ['-S']
- repo: https://github.com/codespell-project/codespell
rev: v2.2.4
hooks:
- id: codespell
exclude: README.ipynb
2 changes: 1 addition & 1 deletion LICENSE
@@ -2,7 +2,7 @@

Canopy™

Copyright © 2022 SomaLogic Operating Company, Inc.
Copyright © 2023 SomaLogic Operating Company, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of the Canopy software
and associated documentation files (the "Software"), to deal in the Software without restriction,
8 changes: 4 additions & 4 deletions README.ipynb

Large diffs are not rendered by default.

38 changes: 19 additions & 19 deletions README.md
@@ -31,7 +31,7 @@ pip install -e ./canopy

### Dependencies

`Python >3.8` is required to install `canopy`. The following package dependencies are installed on a `pip install`:
`Python >=3.8` is required to install `canopy`. The following package dependencies are installed on a `pip install`:
- `pandas >= 1.1.0`
- `numpy >= 1.19.1`

@@ -57,23 +57,23 @@ help(canopy)
*Output*
``` python
#> Help on package canopy:
#>
#>
#> NAME
#> canopy
#>
#>
#> PACKAGE CONTENTS
#> adat
#> base (package)
#> data (package)
#> errors
#> io (package)
#> tools (package)
#>
#>
#> FILE
#> /Users/jallison/Code/canopy/canopy/__init__.py
#>
#>
#>
#>
#>
#>
```

### Internal Objects
@@ -2007,7 +2007,7 @@ They are not intended to be a definitive guide in statistical
analysis and existing packages do exist in the `Python` universe that perform parts
or extensions of these techniques. Many variations of the workflows below
exist, however the framework highlights how one could perform standard
preliminary analyses on SomaLogic data for:
preliminary analyses on SomaLogic data for:
- Two-group differential expression (t-test)
- Binary classification (logistic regression)
- Linear regression
@@ -2100,7 +2100,7 @@ print(clean_data.index.to_frame()['Group'].value_counts())
#> 1 85
#> 0 85
#> Name: Group, dtype: int64
#>
#>
```

### Split the adat based on `Group` and perform t-test across all aptamers
@@ -2482,13 +2482,13 @@ logr_res.summary()
<table class="simpletable">
<caption>Generalized Linear Model Regression Results</caption>
<tr>
<th>Dep. Variable:</th> <td>y</td> <th> No. Observations: </th> <td> 145</td>
<th>Dep. Variable:</th> <td>y</td> <th> No. Observations: </th> <td> 145</td>
</tr>
<tr>
<th>Model:</th> <td>GLM</td> <th> Df Residuals: </th> <td> 139</td>
<th>Model:</th> <td>GLM</td> <th> Df Residuals: </th> <td> 139</td>
</tr>
<tr>
<th>Model Family:</th> <td>Binomial</td> <th> Df Model: </th> <td> 5</td>
<th>Model Family:</th> <td>Binomial</td> <th> Df Model: </th> <td> 5</td>
</tr>
<tr>
<th>Link Function:</th> <td>logit</td> <th> Scale: </th> <td> 1.0000</td>
@@ -2500,18 +2500,18 @@
<th>Date:</th> <td>Fri, 25 Sep 2020</td> <th> Deviance: </th> <td> 16.833</td>
</tr>
<tr>
<th>Time:</th> <td>15:33:26</td> <th> Pearson chi2: </th> <td> 17.8</td>
<th>Time:</th> <td>15:33:26</td> <th> Pearson chi2: </th> <td> 17.8</td>
</tr>
<tr>
<th>No. Iterations:</th> <td>10</td> <th> </th> <td> </td>
<th>No. Iterations:</th> <td>10</td> <th> </th> <td> </td>
</tr>
<tr>
<th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td>
<th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td>
</tr>
</table>
<table class="simpletable">
<tr>
<td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th>
<td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th>
</tr>
<tr>
<th>const</th> <td> 1.6106</td> <td> 1.178</td> <td> 1.367</td> <td> 0.172</td> <td> -0.699</td> <td> 3.920</td>
@@ -2924,15 +2924,15 @@ mod.summary()
<th>Df Residuals:</th> <td> 136</td> <th> BIC: </th> <td> 1089.</td>
</tr>
<tr>
<th>Df Model:</th> <td> 8</td> <th> </th> <td> </td>
<th>Df Model:</th> <td> 8</td> <th> </th> <td> </td>
</tr>
<tr>
<th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td>
<th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td>
</tr>
</table>
<table class="simpletable">
<tr>
<td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[0.025</th> <th>0.975]</th>
<td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[0.025</th> <th>0.975]</th>
</tr>
<tr>
<th>const</th> <td> 55.5436</td> <td> 0.765</td> <td> 72.602</td> <td> 0.000</td> <td> 54.031</td> <td> 57.057</td>
3 changes: 2 additions & 1 deletion canopy/__init__.py
@@ -1,4 +1,5 @@
from canopy.adat import Adat
from canopy.annotations import Annotations
from canopy.io.adat.file import read_file, read_adat
from canopy.io.annotations.file import read_annotations
from canopy.tools.adat_concatenation import concatenate_adats, smart_adat_concatenation
from canopy.annotations import Annotations
1 change: 1 addition & 0 deletions canopy/errors.py
@@ -9,5 +9,6 @@ class AdatKeyError(Exception):
class AdatMetaError(Exception):
pass


class AnnotationsLiftingError(Exception):
pass
80 changes: 52 additions & 28 deletions canopy/io/adat/file.py
@@ -1,17 +1,23 @@
from __future__ import annotations
from typing import TextIO, Dict, List, Tuple, Union
from canopy import Adat
from canopy.tools.math import jround
from canopy.io.adat.errors import AdatReadError

import csv
import json
import pkg_resources
import warnings
import re
import logging
import re
import warnings
from importlib.metadata import version
from typing import Dict, List, TextIO, Tuple, Union

from canopy import Adat
from canopy.io.adat.errors import AdatReadError
from canopy.tools.math import jround


def parse_file(f: TextIO) -> Tuple[List[List[float]], Dict[str, List[str]], Dict[str, List[str]], Dict[str, str]]:
def parse_file(
f: TextIO,
) -> Tuple[
List[List[float]], Dict[str, List[str]], Dict[str, List[str]], Dict[str, str]
]:
"""Returns component pieces of an adat given an adat file object.
Parameters
Expand All @@ -29,7 +35,7 @@ def parse_file(f: TextIO) -> Tuple[List[List[float]], Dict[str, List[str]], Dict
pairs are column-name and an array of each sample's corresponding metadata
column_metadata : Dict[str, List[str]]
A dictionary of each row of the adat column metdata where the key-value pairs are
A dictionary of each row of the adat column metadata where the key-value pairs are
row-name and an array of each somamer's corresponding metadata.
header_metadata : Dict[str, str]
@@ -46,7 +52,6 @@ def parse_file(f: TextIO) -> Tuple[List[List[float]], Dict[str, List[str]], Dict

reader = csv.reader(f, delimiter='\t')
for line in reader:

# Check for trailing Nones
for index, cell in enumerate(reversed(line)):
if cell:
@@ -89,7 +94,9 @@ def parse_file(f: TextIO) -> Tuple[List[List[float]], Dict[str, List[str]], Dict

# If we have the report config section, check to see if it was loaded as a dict
if line[0] == "ReportConfig" and type(header_metadata[line[0]]) != dict:
warnings.warn('Malformed ReportConfig section in header. Setting to an empty dictionary.')
warnings.warn(
'Malformed ReportConfig section in header. Setting to an empty dictionary.'
)
header_metadata[line[0]] = {}

elif current_section == 'COL_DATA':
@@ -110,10 +117,14 @@ def parse_file(f: TextIO) -> Tuple[List[List[float]], Dict[str, List[str]], Dict
# Column Metadata Section
if matrix_depth < col_metadata_length:
column_metadata_name = line[row_metadata_offset]
column_metadata_data = line[row_metadata_offset + 1:]

if column_metadata_name == 'SeqId' and re.match(r'\d{3,}-\d{1,3}_\d+', column_metadata_data[0]):
warnings.warn('V3 style seqIds (i.e., 12345-6_7). Converting to V4 Style. The adat file writer has an option to write using the V3 style')
column_metadata_data = line[row_metadata_offset + 1 :]

if column_metadata_name == 'SeqId' and re.match(
r'\d{3,}-\d{1,3}_\d+', column_metadata_data[0]
):
warnings.warn(
'V3 style seqIds (i.e., 12345-6_7). Converting to V4 Style. The adat file writer has an option to write using the V3 style'
)
seq_id_data = [x.split('_')[0] for x in column_metadata_data]
version_data = [x.split('_')[1] for x in column_metadata_data]
column_metadata[column_metadata_name] = seq_id_data
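The hunk above adds detection of V3-style SeqIds. Its inline logic — match the `12345-6_7` pattern, then split on the underscore into a SeqId and its version — can be sketched as a standalone helper. `split_v3_seq_ids` is illustrative only; the package performs this split inline rather than through a named function:

```python
import re

# V3 SeqIds look like '12345-6_7': id, dash-suffixed clone, underscore, version.
V3_SEQ_ID = re.compile(r'\d{3,}-\d{1,3}_\d+')


def split_v3_seq_ids(ids):
    """Split V3-style ids into parallel (seq_ids, versions) lists."""
    if not all(V3_SEQ_ID.match(x) for x in ids):
        raise ValueError('not V3-style SeqIds')
    seq_ids = [x.split('_')[0] for x in ids]
    versions = [x.split('_')[1] for x in ids]
    return seq_ids, versions
```

On the writer side, `convert_to_v3_seq_ids=True` reverses this by rejoining `SeqId` and `SeqIdVersion` with an underscore.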
@@ -141,14 +152,13 @@ def parse_file(f: TextIO) -> Tuple[List[List[float]], Dict[str, List[str]], Dict

# Row Metadata & RFU Section
elif matrix_depth > col_metadata_length:

# Store in row metadata into dictionary
row_metadata_data = line[:row_metadata_offset]
for name, data in zip(row_metadata_names, row_metadata_data):
row_metadata[name].append(data)

# Store the RFU data
rfu_row_data = line[row_metadata_offset + 1:]
rfu_row_data = line[row_metadata_offset + 1 :]
converted_rfu_row_data = list(map(float, rfu_row_data))
rfu_matrix.append(converted_rfu_row_data)

@@ -160,7 +170,9 @@ def read_file(filepath: str) -> Adat:
WILL BE REMOVED IN A FUTURE RELEASE
"""
logging.warning('THIS FUNCTION IS DEPRECATED AND WILL BE REMOVED IN A FUTURE RELEASE.\n PLEASE USE `canopy.read_adat` instead.')
logging.warning(
'THIS FUNCTION IS DEPRECATED AND WILL BE REMOVED IN A FUTURE RELEASE.\n PLEASE USE `canopy.read_adat` instead.'
)
return read_adat(filepath)
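The deprecated `read_file` shim above emits its notice through `logging.warning`. A common alternative — sketched here with stand-in names, where `read_adat` is a dummy rather than the real parser — is the stdlib `warnings` module, whose `DeprecationWarning` category tools like pytest can filter or escalate:

```python
import warnings


def read_adat(filepath):
    # Stand-in for the real reader; returns a marker instead of an Adat.
    return 'parsed:' + filepath


def read_file(filepath):
    """Deprecated alias kept for backward compatibility."""
    warnings.warn(
        'read_file is deprecated; use canopy.read_adat instead.',
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the caller, not this shim
    )
    return read_adat(filepath)
```

`logging.warning`, as used in the diff, has the advantage of being visible by default; `DeprecationWarning` is hidden outside test runners unless a filter enables it.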


@@ -184,26 +196,34 @@ def read_adat(path_or_buf: Union[str, TextIO]) -> Adat:
with open(path_or_buf, 'r') as f:
rfu_matrix, row_metadata, column_metadata, header_metadata = parse_file(f)
else:
rfu_matrix, row_metadata, column_metadata, header_metadata = parse_file(path_or_buf)
rfu_matrix, row_metadata, column_metadata, header_metadata = parse_file(
path_or_buf
)

return Adat.from_features(
rfu_matrix=rfu_matrix,
row_metadata=row_metadata,
column_metadata=column_metadata,
header_metadata=header_metadata
header_metadata=header_metadata,
)


def write_file(adat, path: str, round_rfu: bool = True, convert_to_v3_seq_ids: bool = False) -> None:
def write_file(
adat, path: str, round_rfu: bool = True, convert_to_v3_seq_ids: bool = False
) -> None:
"""DEPRECATED: SEE canopy.write_adat
WILL BE REMOVED IN A FUTURE RELEASE
"""
logging.warning('THIS FUNCTION IS DEPRECATED AND WILL BE REMOVED IN A FUTURE RELEASE.\n PLEASE USE `canopy.write_adat` instead.')
logging.warning(
'THIS FUNCTION IS DEPRECATED AND WILL BE REMOVED IN A FUTURE RELEASE.\n PLEASE USE `canopy.write_adat` instead.'
)
read_adat(adat, path, round_rfu, convert_to_v3_seq_ids)


def write_adat(adat, f: TextIO, round_rfu: bool = True, convert_to_v3_seq_ids: bool = False) -> None:
def write_adat(
adat, f: TextIO, round_rfu: bool = True, convert_to_v3_seq_ids: bool = False
) -> None:
"""Write this Adat to an adat format data source.
Parameters
@@ -233,7 +253,7 @@ def write_adat(adat, f: TextIO, round_rfu: bool = True, convert_to_v3_seq_ids: b
"""

# Add version number to header_metadata. If the field already exists, append to it.
pkg_version = 'Canopy_' + pkg_resources.require('canopy')[0].version
pkg_version = 'Canopy_' + version('canopy')
if '!GeneratedBy' not in adat.header_metadata:
adat.header_metadata['!GeneratedBy'] = pkg_version
elif pkg_version not in adat.header_metadata['!GeneratedBy']:
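The version stamp above switches from `pkg_resources` (deprecated in recent setuptools, and slow to import) to `importlib.metadata.version`, which has been in the stdlib since Python 3.8 — matching the workflow's 3.8/3.12 test matrix. A minimal sketch of the new lookup; the `PackageNotFoundError` fallback is illustrative and not part of the diff:

```python
from importlib.metadata import PackageNotFoundError, version


def generated_by_tag(package='canopy'):
    """Build a 'Canopy_<version>' stamp for the '!GeneratedBy' header field."""
    try:
        return 'Canopy_' + version(package)
    except PackageNotFoundError:
        # Hypothetical fallback for when the package isn't installed.
        return 'Canopy_unknown'
```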
@@ -255,7 +275,6 @@ def write_adat(adat, f: TextIO, round_rfu: bool = True, convert_to_v3_seq_ids: b
# Write HEADER section
writer.writerow(['^HEADER'])
for row in adat.header_metadata.items():

# We need to handle the reportconfig in a special way since it has double quotes
if row[0] == "ReportConfig":
f.write(row[0] + '\t' + json.dumps(row[1], separators=(',', ':')) + '\r\n')
@@ -284,7 +303,10 @@ def write_adat(adat, f: TextIO, round_rfu: bool = True, convert_to_v3_seq_ids: b
# Check if we are converting to the V3 style of adat seqIds
if column_name == 'SeqId' and convert_to_v3_seq_ids:
version_data = adat.columns.get_level_values('SeqIdVersion')
column_data = [seq_id + '_' + version for seq_id, version in zip(column_data, version_data)]
column_data = [
seq_id + '_' + version
for seq_id, version in zip(column_data, version_data)
]
if column_name == 'SeqIdVersion' and convert_to_v3_seq_ids:
continue

@@ -299,10 +321,12 @@ def write_adat(adat, f: TextIO, round_rfu: bool = True, convert_to_v3_seq_ids: b
extra_nones = len(adat.columns.get_level_values(column_names[0])) + 1
writer.writerow(row_names + [None for x in range(extra_nones)])

# Write the row metadata and rfu matrix simulataneously
# Write the row metadata and rfu matrix simultaneously
for i, rfu_row in enumerate(adat.values):
# Prep the data
row_metadata = [adat.index.get_level_values(row_name)[i] for row_name in row_names]
row_metadata = [
adat.index.get_level_values(row_name)[i] for row_name in row_names
]
if round_rfu:
rfu_row = [jround(rfu, 1) for rfu in rfu_row]
else:
Empty file.
