Skip to content

Commit

Permalink
Merge pull request #58 from saksham219/issue_36
Browse files Browse the repository at this point in the history
Issue #36 (add sort option for parse gctx)
  • Loading branch information
tnat1031 authored Nov 8, 2019
2 parents a08ddeb + 5c9ea07 commit 28817ff
Show file tree
Hide file tree
Showing 31 changed files with 296 additions and 177 deletions.
27 changes: 9 additions & 18 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,54 +10,45 @@ matrix:
include:
# run pandasGEXpress python2_tests
- python: "2.7"
env: TEST_DIR=cmapPy/pandasGEXpress/tests/python2_tests
script:
- cd $TEST_DIR && python -m unittest discover -p "test_*.py"
- python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python2_tests/

# run pandasGEXpress python3_tests
- python: "3.6"
env: TEST_DIR=cmapPy/pandasGEXpress/tests/python3_tests
script:
- cd $TEST_DIR && python -m unittest discover -p "test_*.py"
- python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python3_tests/

# run set_io tests for python2
- python: "2.7"
env: TEST_DIR=cmapPy/set_io/tests
script:
- cd $TEST_DIR && python -m unittest discover -p "test_*.py"
- python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/

# run set_io tests for python3
- python: "3.6"
env: TEST_DIR=cmapPy/set_io/tests
script:
- cd $TEST_DIR && python -m unittest discover -p "test_*.py"
- python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/

# run math tests for python2
- python: "2.7"
env: TEST_DIR=cmapPy/math/tests
script:
- cd $TEST_DIR && python -m unittest discover -p "test_*.py"
- python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/

# run math tests for python3
- python: "3.6"
env: TEST_DIR=cmapPy/math/tests
script:
- cd $TEST_DIR && python -m unittest discover -p "test_*.py"
- python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/

# run python2_python3_compatibility tests for python2
- python: "2.7"
env: TEST_DIR=cmapPy/pandasGEXpress/tests
script:
- cd $TEST_DIR && python -m unittest discover -p test_python2_python3_compatibility.py
- python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/

# run python2_python3_compatibility tests for python3
- python: "3.6"
env: TEST_DIR=cmapPy/pandasGEXpress/tests
script:
- cd $TEST_DIR && python -m unittest discover -p test_python2_python3_compatibility.py
- python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/

# what branches of github to use
branches:
only:
- master
- travis_testing
- master
Empty file added cmapPy/math/tests/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion cmapPy/pandasGEXpress/mini_gctoo_for_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def make(convert_neg_666=True):
if convert_neg_666:
mini_row_metadata = mini_row_metadata.replace([-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])
# if all values in a column are nan (i.e. mini_row_metadata.isna().sum() == mini_row_metadata.shape[0]), convert dtype of that column to float
all_nan_columns = numpy.array(mini_row_metadata.isnull().sum() == mini_row_metadata.shape[0]).nonzero()[0]
all_nan_columns = (mini_row_metadata.isnull().sum() == numpy.array(mini_row_metadata.shape[0])).nonzero()[0]
mini_row_metadata = mini_row_metadata.astype({d: 'float' for d in mini_row_metadata.columns[all_nan_columns.tolist()]})
else:
mini_row_metadata = mini_row_metadata.replace([-666, -666.0], ["-666", "-666"])
Expand Down
81 changes: 60 additions & 21 deletions cmapPy/pandasGEXpress/parse_gctx.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@


def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False):
ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False,
sort_col_meta = True, sort_row_meta = True):
"""
Primary method of script. Reads in path to a gctx file and parses into GCToo object.
Expand All @@ -44,7 +45,8 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
as pandas DataFrame
- make_multiindex (bool): whether to create a multi-index df combining
the 3 component dfs
- sort_col_meta (bool) : whether to sort the column metadata by indexes. Default = True
- sort_row_meta (bool) : whether to sort the row metadata by indexes. Default = True
Output:
- myGCToo (GCToo): A GCToo instance containing content of parsed gctx file. Note: if meta_only = True,
this will be a GCToo instance where the data_df is empty, i.e. data_df = pd.DataFrame(index=rids,
Expand Down Expand Up @@ -74,25 +76,39 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
row_meta = parse_metadata_df("row", row_dset, convert_neg_666)

# validate optional input ids & get indexes to subset by
(sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, None)
(sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, None,
sort_row_meta = True, sort_col_meta = True)

gctx_file.close()

# subset if specified, then return
row_meta = row_meta.iloc[sorted_ridx]

if not sort_row_meta:
(unsorted_ridx, _) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, None,
sort_row_meta, sort_row_meta)
row_meta = row_meta.iloc[unsorted_ridx]

return row_meta
elif col_meta_only:
# read in col metadata
col_dset = gctx_file[col_meta_group_node]
col_meta = parse_metadata_df("col", col_dset, convert_neg_666)

# validate optional input ids & get indexes to subset by
(sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, None, col_meta)
(sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, None,
col_meta, sort_row_meta = True, sort_col_meta = True)

gctx_file.close()

# subset if specified, then return
col_meta = col_meta.iloc[sorted_cidx]

if not sort_col_meta:
(_, unsorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, None, col_meta,
sort_row_meta, sort_col_meta)
col_meta = col_meta.iloc[unsorted_cidx, :]

return col_meta
else:
# read in row metadata
Expand All @@ -104,7 +120,8 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
col_meta = parse_metadata_df("col", col_dset, convert_neg_666)

# validate optional input ids & get indexes to subset by
(sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta)
(sorted_ridx, sorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta,
sort_row_meta = True, sort_col_meta = True)

data_dset = gctx_file[data_node]
data_df = parse_data_df(data_dset, sorted_ridx, sorted_cidx, row_meta, col_meta)
Expand All @@ -113,6 +130,20 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
row_meta = row_meta.iloc[sorted_ridx]
col_meta = col_meta.iloc[sorted_cidx]

if not sort_col_meta:
## in the subsetted and re-indexed dataframe get where new indexes lie
(_, unsorted_cidx) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta,
sort_row_meta, sort_col_meta)

data_df = data_df.iloc[:,unsorted_cidx]
col_meta = col_meta.iloc[unsorted_cidx,:]

if not sort_row_meta:
(unsorted_ridx, _) = check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta, col_meta,
sort_row_meta, sort_row_meta)
data_df = data_df.iloc[unsorted_ridx,:]
row_meta = row_meta.iloc[unsorted_ridx,:]

# get version
my_version = gctx_file.attrs[version_node]
if type(my_version) == np.ndarray:
Expand All @@ -126,25 +157,28 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
return my_gctoo


def check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta_df, col_meta_df,
                              sort_row_meta=True, sort_col_meta=True):
    """
    Makes sure that (if entered) id inputs entered are of one type (string id or index),
    validates them against the metadata and turns them into index lists.

    Input:
        - rid (list or None): if not None, a list of rids
        - ridx (list or None): if not None, a list of indexes
        - cid (list or None): if not None, a list of cids
        - cidx (list or None): if not None, a list of indexes
        - row_meta_df (dataframe or None): row metadata the row ids are checked against
        - col_meta_df (dataframe or None): column metadata the column ids are checked against
        - sort_row_meta (bool): boolean indicating whether to return sorted row indexes.
            Default = True, so callers written for the previous six-argument
            signature keep working.
        - sort_col_meta (bool): boolean indicating whether to return sorted column indexes.
            Default = True, for the same reason.
    Output:
        - a tuple of the ordered ridx and cidx
    """
    (row_type, row_ids) = check_id_idx_exclusivity(rid, ridx)
    (col_type, col_ids) = check_id_idx_exclusivity(cid, cidx)

    # NOTE(review): row ids are validated with sort_col_meta, not sort_row_meta.
    # This looks like a copy/paste slip, but parse() re-invokes this function
    # against already-subsetted metadata and relies on the flag passed here
    # being False to skip the index range check. Switching it to sort_row_meta
    # without also reworking those second-pass calls would make parse() raise
    # for valid ridx input, so the behavior is intentionally left unchanged.
    row_ids = check_and_convert_ids(row_type, row_ids, row_meta_df, sort_col_meta)
    ordered_ridx = get_ordered_idx(row_type, row_ids, row_meta_df, sort_row_meta)

    col_ids = check_and_convert_ids(col_type, col_ids, col_meta_df, sort_col_meta)
    ordered_cidx = get_ordered_idx(col_type, col_ids, col_meta_df, sort_col_meta)
    return (ordered_ridx, ordered_cidx)


Expand Down Expand Up @@ -172,13 +206,13 @@ def check_id_idx_exclusivity(id, idx):
return (None, [])


def check_and_convert_ids(id_type, id_list, meta_df, sort_id=True):
    """
    Validates the ids (or indexes) requested for one dimension against its metadata.

    Input:
        - id_type (str): "id" when id_list holds id names; any other value
            (e.g. "idx" or None) sends id_list through the index range check
        - id_list (list): either a list of indexes or id names
        - meta_df (dataframe or None): metadata for the dimension being checked
        - sort_id (bool): forwarded to check_idx_validity, which only performs
            its range check when this is True. Default = True, so callers
            written for the previous three-argument signature keep working.
    Output:
        - the (possibly converted) id_list, or None when meta_df is None
    """
    # Nothing to validate against when this dimension's metadata was not read.
    if meta_df is None:
        return None

    if id_type == "id":
        id_list = convert_ids_to_meta_type(id_list, meta_df)
        check_id_validity(id_list, meta_df)
    else:
        check_idx_validity(id_list, meta_df, sort_id)
    return id_list
Expand All @@ -195,14 +229,15 @@ def check_id_validity(id_list, meta_df):
raise Exception("parse_gctx check_id_validity " + msg)


def check_idx_validity(id_list, meta_df, sort_id=True):
    """
    Checks that every index in id_list addresses a row of meta_df.

    Input:
        - id_list (list): indexes being used to subset the data
        - meta_df (dataframe): metadata whose number of rows bounds the valid indexes
        - sort_id (bool): when False the range check is skipped entirely.
            Default = True, so callers written for the previous two-argument
            signature keep working.
    Raises:
        - Exception: if sort_id is True and any index is negative or >= the
            number of rows in meta_df
    """
    # NOTE(review): the check is skipped for unsorted requests, presumably
    # because those calls receive metadata that was already subsetted, so the
    # original indexes may legitimately exceed its length - confirm against parse().
    if not sort_id:
        return

    N = meta_df.shape[0]
    out_of_range_ids = [my_id for my_id in id_list if my_id < 0 or my_id >= N]
    if out_of_range_ids:
        msg = "some of indexes being used to subset the data are not valid max N: {} out_of_range_ids: {}".format(
            N, out_of_range_ids)
        logger.error(msg)
        raise Exception("parse_gctx check_idx_validity " + msg)


def convert_ids_to_meta_type(id_list, meta_df):
Expand All @@ -216,12 +251,14 @@ def convert_ids_to_meta_type(id_list, meta_df):
raise Exception("parse_gctx check_if_ids_in_meta " + msg + " ValueError ve: {}".format(ve))


def get_ordered_idx(id_type, id_list, meta_df):
def get_ordered_idx(id_type, id_list, meta_df, sort_idx):
"""
Gets index values corresponding to ids to subset and orders them.
Input:
- id_type (str): either "id", "idx" or None
- id_list (list): either a list of indexes or id names
- meta_df (dataframe): dataframe
- sort_idx (bool): boolean indicating whether to return sorted indexes or not
Output:
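- a list of indexes to subset a dimension by: sorted in ascending order, or (if sort_idx is False) the position of each index within that sorted list, in the order the ids were given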
"""
Expand All @@ -231,6 +268,8 @@ def get_ordered_idx(id_type, id_list, meta_df):
elif id_type == "id":
lookup = {x: i for (i,x) in enumerate(meta_df.index)}
id_list = [lookup[str(i)] for i in id_list]
if not sort_idx:
return [sorted(id_list).index(i) for i in id_list]
return sorted(id_list)
else:
return None
Expand Down
Empty file.
Empty file.
4 changes: 2 additions & 2 deletions cmapPy/pandasGEXpress/tests/python2_tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


logger = logging.getLogger(setup_logger.LOGGER_NAME)
FUNCTIONAL_TESTS_DIR = "../functional_tests"
FUNCTIONAL_TESTS_DIR = "cmapPy/pandasGEXpress/tests/functional_tests/"


class TestConcat(unittest.TestCase):
Expand Down Expand Up @@ -333,7 +333,7 @@ def test_build_mismatched_common_meta_report(self):
self.assertEqual({"r3"}, set(r.orig_rid))

def test_concat_main(self):
test_dir = "../functional_tests/test_concat/test_main"
test_dir = "cmapPy/pandasGEXpress/tests/functional_tests//test_concat/test_main"

g_a = pg.parse(os.path.join(test_dir, "a.gct"))
logger.debug("g_a: {}".format(g_a))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pandas.util.testing as pandas_testing


FUNCTIONAL_TESTS_PATH = "../functional_tests"
FUNCTIONAL_TESTS_PATH = "cmapPy/pandasGEXpress/tests/functional_tests/"

logger = logging.getLogger(setup_logger.LOGGER_NAME)

Expand Down
24 changes: 12 additions & 12 deletions cmapPy/pandasGEXpress/tests/python2_tests/test_gct2gctx.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class TestGCT2GCTx(unittest.TestCase):

def test_gct2gctx_main(self):

in_name = "../functional_tests/mini_gctoo_for_testing.gct"
out_name = "../functional_tests/test_gct2gctx_out.gctx"
in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct"
out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out.gctx"
args_string = "-f {} -o {}".format(in_name, out_name)
args = gct2gctx.build_parser().parse_args(args_string.split())

Expand All @@ -29,10 +29,10 @@ def test_gct2gctx_main(self):
pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, out_gctx.col_metadata_df)
pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, out_gctx.row_metadata_df)

no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
added_meta = "../functional_tests/test_gct2gctx_out_annotated.gctx"
row_meta = "../functional_tests/test_rowmeta_n6.txt"
col_meta = "../functional_tests/test_colmeta_n6.txt"
no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out_annotated.gctx"
row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt"
col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt"
args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta)
args = gct2gctx.build_parser().parse_args(args_string.split())

Expand All @@ -51,9 +51,9 @@ def test_gct2gctx_main(self):

def test_missing_annotations(self):
with self.assertRaises(Exception) as context:
no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx"
row_meta = "../functional_tests/test_missing_rowmeta.txt"
no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx"
row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt"
args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
args = gct2gctx.build_parser().parse_args(args_string.split())

Expand All @@ -62,9 +62,9 @@ def test_missing_annotations(self):
self.assertTrue('Row ids in matrix missing from annotations file' in context.exception)

with self.assertRaises(Exception) as context:
no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx"
col_meta = "../functional_tests/test_missing_colmeta.txt"
no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx"
col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt"
args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
args = gct2gctx.build_parser().parse_args(args_string.split())

Expand Down
24 changes: 12 additions & 12 deletions cmapPy/pandasGEXpress/tests/python2_tests/test_gctx2gct.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class TestGCTx2GCT(unittest.TestCase):

def test_gctx2gct_main(self):

in_name = "../functional_tests/mini_gctoo_for_testing.gctx"
out_name = "../functional_tests/test_gctx2gct_out.gct"
in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx"
out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out.gct"
args_string = "-f {} -o {}".format(in_name, out_name)
args = gctx2gct.build_parser().parse_args(args_string.split())

Expand All @@ -29,10 +29,10 @@ def test_gctx2gct_main(self):
pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, out_gct.col_metadata_df)
pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, out_gct.row_metadata_df)

no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gctx"
added_meta = "../functional_tests/test_gctx2gct_out_annotated.gct"
row_meta = "../functional_tests/test_rowmeta_n6.txt"
col_meta = "../functional_tests/test_colmeta_n6.txt"
no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt"
col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt"
args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta )
args = gctx2gct.build_parser().parse_args(args_string.split())

Expand All @@ -51,9 +51,9 @@ def test_gctx2gct_main(self):

def test_missing_annotations(self):
with self.assertRaises(Exception) as context:
no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gctx"
added_meta = "../functional_tests/test_gctx2gct_out_annotated.gct"
row_meta = "../functional_tests/test_missing_rowmeta.txt"
no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt"
args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
args = gctx2gct.build_parser().parse_args(args_string.split())

Expand All @@ -62,9 +62,9 @@ def test_missing_annotations(self):
self.assertTrue('Row ids in matrix missing from annotations file' in context.exception)

with self.assertRaises(Exception) as context:
no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gctx"
added_meta = "../functional_tests/test_gctx2gct_out_annotated.gct"
col_meta = "../functional_tests/test_missing_colmeta.txt"
no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt"
args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
args = gctx2gct.build_parser().parse_args(args_string.split())

Expand Down
Loading

0 comments on commit 28817ff

Please sign in to comment.