Merge pull request #58 from saksham219/issue_36

Issue #36 (add sort option for parse gctx)
cmap · Nov 8, 2019 · 28817ff · 28817ff
2 parents a08ddeb + 5c9ea07
commit 28817ff
Show file tree

Hide file tree

Showing 31 changed files with 296 additions and 177 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -10,54 +10,45 @@ matrix:
   include:
     # run pandasGEXpress python2_tests      
     - python: "2.7"
-      env: TEST_DIR=cmapPy/pandasGEXpress/tests/python2_tests 
       script:
-        - cd $TEST_DIR && python -m unittest discover -p "test_*.py"
+        - python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python2_tests/
 
     # run pandasGEXpress python3_tests         
     - python: "3.6"
-      env: TEST_DIR=cmapPy/pandasGEXpress/tests/python3_tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p "test_*.py"
+        - python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python3_tests/
 
     # run set_io tests for python2    
     - python: "2.7"
-      env: TEST_DIR=cmapPy/set_io/tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p "test_*.py"
+        - python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/
 
     # run set_io tests for python3    
     - python: "3.6"
-      env: TEST_DIR=cmapPy/set_io/tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p "test_*.py"
+        - python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/
 
     # run math tests for python2
     - python: "2.7"
-      env: TEST_DIR=cmapPy/math/tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p "test_*.py"
+        - python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/
 
      # run math tests for python3
     - python: "3.6"
-      env: TEST_DIR=cmapPy/math/tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p "test_*.py"
+        - python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/
 
     # run python2_python3_compatibility tests for python2
     - python: "2.7"
-      env: TEST_DIR=cmapPy/pandasGEXpress/tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p test_python2_python3_compatibility.py
+        - python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/
 
     # run python2_python3_compatibility tests for python3
     - python: "3.6"
-      env: TEST_DIR=cmapPy/pandasGEXpress/tests
       script:
-        - cd $TEST_DIR && python -m unittest discover -p test_python2_python3_compatibility.py
+        - python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/
 
 # what branches of github to use
 branches:
   only:
-    - master
-    - travis_testing
+    - master
diff --git a/cmapPy/math/tests/__init__.py b/cmapPy/math/tests/__init__.py
diff --git a/cmapPy/pandasGEXpress/mini_gctoo_for_testing.py b/cmapPy/pandasGEXpress/mini_gctoo_for_testing.py
@@ -47,7 +47,7 @@ def make(convert_neg_666=True):
     if convert_neg_666:
         mini_row_metadata = mini_row_metadata.replace([-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])
         # if all values in a column are nan, i.e. mini_row_metadata.isna().sum() == mini_row_metadata.shape[0], convert dtype of that column to float
-        all_nan_columns = numpy.array(mini_row_metadata.isnull().sum() == mini_row_metadata.shape[0]).nonzero()[0]
+        all_nan_columns = (mini_row_metadata.isnull().sum() == numpy.array(mini_row_metadata.shape[0])).nonzero()[0]
         mini_row_metadata = mini_row_metadata.astype({d: 'float' for d in mini_row_metadata.columns[all_nan_columns.tolist()]})
     else:
         mini_row_metadata = mini_row_metadata.replace([-666, -666.0], ["-666", "-666"])

diff --git a/cmapPy/pandasGEXpress/parse_gctx.py b/cmapPy/pandasGEXpress/parse_gctx.py
@@ -21,7 +21,8 @@
 
 
 def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
-          ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False):
+          ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False,
+          sort_col_meta = True, sort_row_meta = True):
     """
     Primary method of script. Reads in path to a gctx file and parses into GCToo object.
 
@@ -44,7 +45,8 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
             as pandas DataFrame
         - make_multiindex (bool): whether to create a multi-index df combining
             the 3 component dfs
-
+        - sort_col_meta (bool) : whether to sort the column metadata by indexes. Default = True
+        - sort_row_meta (bool) : whether to sort the row metadata by indexes. Default = True
     Output:
         - myGCToo (GCToo): A GCToo instance containing content of parsed gctx file. Note: if meta_only = True,
             this will be a GCToo instance where the data_df is empty, i.e. data_df = pd.DataFrame(index=rids,
@@ -74,25 +76,39 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
         row_meta = parse_metadata_df("row", row_dset, convert_neg_666)
 
         # validate optional input ids & get indexes to subset by
-        (sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, None)
+        (sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, None, 
+                                                                sort_row_meta = True, sort_col_meta = True)
 
         gctx_file.close()
 
         # subset if specified, then return
         row_meta = row_meta.iloc[sorted_ridx]
+
+        if not sort_row_meta:
+            (unsorted_ridx, _) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, None,
+                                                      sort_row_meta, sort_row_meta)
+            row_meta = row_meta.iloc[unsorted_ridx]
+
         return row_meta
     elif col_meta_only:
         # read in col metadata
         col_dset = gctx_file[col_meta_group_node]
         col_meta = parse_metadata_df("col", col_dset, convert_neg_666)
 
         # validate optional input ids & get indexes to subset by
-        (sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, None, col_meta)
+        (sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, None, 
+                                                            col_meta, sort_row_meta = True, sort_col_meta = True)
 
         gctx_file.close()
 
         # subset if specified, then return
         col_meta = col_meta.iloc[sorted_cidx]
+
+        if not sort_col_meta:
+            (_, unsorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, None, col_meta, 
+                                                        sort_row_meta, sort_col_meta)
+            col_meta = col_meta.iloc[unsorted_cidx, :]
+
         return col_meta
     else:
         # read in row metadata
@@ -104,7 +120,8 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
         col_meta = parse_metadata_df("col", col_dset, convert_neg_666)
 
         # validate optional input ids & get indexes to subset by
-        (sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta)
+        (sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta, 
+                                                                sort_row_meta = True, sort_col_meta = True)
 
         data_dset = gctx_file[data_node]
         data_df = parse_data_df(data_dset, sorted_ridx, sorted_cidx, row_meta, col_meta)
@@ -113,6 +130,20 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
         row_meta = row_meta.iloc[sorted_ridx]
         col_meta = col_meta.iloc[sorted_cidx]
 
+        if not sort_col_meta:
+            # find the position of each requested (unsorted) index within the already subsetted, sorted dataframe
+            (_, unsorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta, 
+                                                        sort_row_meta, sort_col_meta)
+
+            data_df = data_df.iloc[:,unsorted_cidx]
+            col_meta = col_meta.iloc[unsorted_cidx,:]
+
+        if not sort_row_meta:
+            (unsorted_ridx, _) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta,
+                                                      sort_row_meta, sort_row_meta)
+            data_df = data_df.iloc[unsorted_ridx,:]
+            row_meta = row_meta.iloc[unsorted_ridx,:]
+
         # get version
         my_version = gctx_file.attrs[version_node]
         if type(my_version) == np.ndarray:
@@ -126,25 +157,28 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
         return my_gctoo
 
 
-def check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta_df, col_meta_df):
+def check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta_df, col_meta_df, sort_row_meta, sort_col_meta):
     """
     Makes sure that (if entered) id inputs entered are of one type (string id or index)
     Input:
         - rid (list or None): if not None, a list of rids
         - ridx (list or None): if not None, a list of indexes
         - cid (list or None): if not None, a list of cids
         - cidx (list or None): if not None, a list of indexes
+        - sort_row_meta (bool): boolean indicating whether to return sorted row indexes
+        - sort_col_meta (bool): boolean indicating whether to return sorted column indexes
     Output:
         - a tuple of the ordered ridx and cidx
     """
     (row_type, row_ids) = check_id_idx_exclusivity(rid, ridx)
     (col_type, col_ids) = check_id_idx_exclusivity(cid, cidx)
 
-    row_ids = check_and_convert_ids(row_type, row_ids, row_meta_df)
-    ordered_ridx = get_ordered_idx(row_type, row_ids, row_meta_df)
 
-    col_ids = check_and_convert_ids(col_type, col_ids, col_meta_df)
-    ordered_cidx = get_ordered_idx(col_type, col_ids, col_meta_df)
+    row_ids = check_and_convert_ids(row_type, row_ids, row_meta_df, sort_col_meta)
+    ordered_ridx = get_ordered_idx(row_type, row_ids, row_meta_df, sort_row_meta)
+
+    col_ids = check_and_convert_ids(col_type, col_ids, col_meta_df, sort_col_meta)
+    ordered_cidx = get_ordered_idx(col_type, col_ids, col_meta_df, sort_col_meta)
     return (ordered_ridx, ordered_cidx)
 
 
@@ -172,13 +206,13 @@ def check_id_idx_exclusivity(id, idx):
         return (None, [])
 
 
-def check_and_convert_ids(id_type, id_list, meta_df):
+def check_and_convert_ids(id_type, id_list, meta_df, sort_id):
     if meta_df is not None:
         if id_type == "id":
             id_list = convert_ids_to_meta_type(id_list, meta_df)
             check_id_validity(id_list, meta_df)
         else:
-            check_idx_validity(id_list, meta_df)
+            check_idx_validity(id_list, meta_df, sort_id)
         return id_list
     else:
         return None
@@ -195,14 +229,15 @@ def check_id_validity(id_list, meta_df):
         raise Exception("parse_gctx check_id_validity " + msg)
 
 
-def check_idx_validity(id_list, meta_df):
-    N = meta_df.shape[0]
-    out_of_range_ids = [my_id for my_id in id_list if my_id < 0 or my_id >= N]
-    if len(out_of_range_ids):
-        msg = "some of indexes being used to subset the data are not valid max N:  {}  out_of_range_ids:  {}".format(N,
-                                                                                                                     out_of_range_ids)
-        logger.error(msg)
-        raise Exception("parse_gctx check_idx_validity " + msg)
+def check_idx_validity(id_list, meta_df, sort_id):
+    if sort_id:
+        N = meta_df.shape[0]
+        out_of_range_ids = [my_id for my_id in id_list if my_id < 0 or my_id >= N]
+        if len(out_of_range_ids):
+            msg = "some of indexes being used to subset the data are not valid max N:  {}  out_of_range_ids:  {}".format(N,
+                                                                                                     out_of_range_ids)
+            logger.error(msg)
+            raise Exception("parse_gctx check_idx_validity " + msg)
 
 
 def convert_ids_to_meta_type(id_list, meta_df):
@@ -216,12 +251,14 @@ def convert_ids_to_meta_type(id_list, meta_df):
         raise Exception("parse_gctx check_if_ids_in_meta " + msg + "  ValueError ve:  {}".format(ve))
 
 
-def get_ordered_idx(id_type, id_list, meta_df):
+def get_ordered_idx(id_type, id_list, meta_df, sort_idx):
     """
     Gets index values corresponding to ids to subset and orders them.
     Input:
         - id_type (str): either "id", "idx" or None
         - id_list (list): either a list of indexes or id names
+        - meta_df (pandas DataFrame): metadata dataframe whose index is used to look up ids
+        - sort_idx (bool): boolean indicating whether to return sorted indexes or not
     Output:
         - a sorted list of indexes to subset a dimension by
     """
@@ -231,6 +268,8 @@ def get_ordered_idx(id_type, id_list, meta_df):
         elif id_type == "id":
             lookup = {x: i for (i,x) in enumerate(meta_df.index)}
             id_list = [lookup[str(i)] for i in id_list]
+        if not sort_idx:
+            return [sorted(id_list).index(i) for i in id_list] 
         return sorted(id_list)
     else:
         return None

diff --git a/cmapPy/pandasGEXpress/tests/__init__.py b/cmapPy/pandasGEXpress/tests/__init__.py
diff --git a/cmapPy/pandasGEXpress/tests/python2_tests/__init__.py b/cmapPy/pandasGEXpress/tests/python2_tests/__init__.py
diff --git a/cmapPy/pandasGEXpress/tests/python2_tests/test_concat.py b/cmapPy/pandasGEXpress/tests/python2_tests/test_concat.py
@@ -10,7 +10,7 @@
 
 
 logger = logging.getLogger(setup_logger.LOGGER_NAME)
-FUNCTIONAL_TESTS_DIR = "../functional_tests"
+FUNCTIONAL_TESTS_DIR = "cmapPy/pandasGEXpress/tests/functional_tests/"
 
 
 class TestConcat(unittest.TestCase):
@@ -333,7 +333,7 @@ def test_build_mismatched_common_meta_report(self):
         self.assertEqual({"r3"}, set(r.orig_rid))
 
     def test_concat_main(self):
-        test_dir = "../functional_tests/test_concat/test_main"
+        test_dir = "cmapPy/pandasGEXpress/tests/functional_tests//test_concat/test_main"
 
         g_a = pg.parse(os.path.join(test_dir, "a.gct"))
         logger.debug("g_a:  {}".format(g_a))

diff --git a/cmapPy/pandasGEXpress/tests/python2_tests/test_edge_cases.py b/cmapPy/pandasGEXpress/tests/python2_tests/test_edge_cases.py
@@ -10,7 +10,7 @@
 import pandas.util.testing as pandas_testing
 
 
-FUNCTIONAL_TESTS_PATH = "../functional_tests"
+FUNCTIONAL_TESTS_PATH = "cmapPy/pandasGEXpress/tests/functional_tests/"
 
 logger = logging.getLogger(setup_logger.LOGGER_NAME)
 

diff --git a/cmapPy/pandasGEXpress/tests/python2_tests/test_gct2gctx.py b/cmapPy/pandasGEXpress/tests/python2_tests/test_gct2gctx.py
@@ -14,8 +14,8 @@ class TestGCT2GCTx(unittest.TestCase):
 
 	def test_gct2gctx_main(self):
 
-		in_name = "../functional_tests/mini_gctoo_for_testing.gct"
-		out_name = "../functional_tests/test_gct2gctx_out.gctx"
+		in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct"
+		out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out.gctx"
 		args_string = "-f {} -o {}".format(in_name, out_name)
 		args = gct2gctx.build_parser().parse_args(args_string.split())
 
@@ -29,10 +29,10 @@ def test_gct2gctx_main(self):
 		pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, out_gctx.col_metadata_df)
 		pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, out_gctx.row_metadata_df)
 
-		no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
-		added_meta = "../functional_tests/test_gct2gctx_out_annotated.gctx"
-		row_meta = "../functional_tests/test_rowmeta_n6.txt"
-		col_meta = "../functional_tests/test_colmeta_n6.txt"
+		no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
+		added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out_annotated.gctx"
+		row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt"
+		col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt"
 		args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta)
 		args = gct2gctx.build_parser().parse_args(args_string.split())
 
@@ -51,9 +51,9 @@ def test_gct2gctx_main(self):
 
 	def test_missing_annotations(self):
 		with self.assertRaises(Exception) as context:
-			no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
-			added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx"
-			row_meta = "../functional_tests/test_missing_rowmeta.txt"
+			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
+			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx"
+			row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt"
 			args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
 			args = gct2gctx.build_parser().parse_args(args_string.split())
 
@@ -62,9 +62,9 @@ def test_missing_annotations(self):
 		self.assertTrue('Row ids in matrix missing from annotations file' in context.exception)
 
 		with self.assertRaises(Exception) as context:
-			no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
-			added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx"
-			col_meta = "../functional_tests/test_missing_colmeta.txt"
+			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
+			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx"
+			col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt"
 			args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
 			args = gct2gctx.build_parser().parse_args(args_string.split())
 

diff --git a/cmapPy/pandasGEXpress/tests/python2_tests/test_gctx2gct.py b/cmapPy/pandasGEXpress/tests/python2_tests/test_gctx2gct.py
@@ -14,8 +14,8 @@ class TestGCTx2GCT(unittest.TestCase):
 
 	def test_gctx2gct_main(self):
 
-		in_name = "../functional_tests/mini_gctoo_for_testing.gctx"
-		out_name = "../functional_tests/test_gctx2gct_out.gct"
+		in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx"
+		out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out.gct"
 		args_string = "-f {} -o {}".format(in_name, out_name)
 		args = gctx2gct.build_parser().parse_args(args_string.split())
 
@@ -29,10 +29,10 @@ def test_gctx2gct_main(self):
 		pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, out_gct.col_metadata_df)
 		pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, out_gct.row_metadata_df)
 
-		no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gctx"
-		added_meta = "../functional_tests/test_gctx2gct_out_annotated.gct"
-		row_meta = "../functional_tests/test_rowmeta_n6.txt"
-		col_meta = "../functional_tests/test_colmeta_n6.txt"
+		no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
+		added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
+		row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt"
+		col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt"
 		args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta )
 		args = gctx2gct.build_parser().parse_args(args_string.split())
 
@@ -51,9 +51,9 @@ def test_gctx2gct_main(self):
 
 	def test_missing_annotations(self):
 		with self.assertRaises(Exception) as context:
-			no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gctx"
-			added_meta = "../functional_tests/test_gctx2gct_out_annotated.gct"
-			row_meta = "../functional_tests/test_missing_rowmeta.txt"
+			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
+			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
+			row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt"
 			args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
 			args = gctx2gct.build_parser().parse_args(args_string.split())
 
@@ -62,9 +62,9 @@ def test_missing_annotations(self):
 		self.assertTrue('Row ids in matrix missing from annotations file' in context.exception)
 
 		with self.assertRaises(Exception) as context:
-			no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gctx"
-			added_meta = "../functional_tests/test_gctx2gct_out_annotated.gct"
-			col_meta = "../functional_tests/test_missing_colmeta.txt"
+			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
+			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
+			col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt"
 			args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
 			args = gctx2gct.build_parser().parse_args(args_string.split())