updated all files, should solve issue with proteomic

jclachance · Nov 21, 2017 · ca1f675 · ca1f675
1 parent 4f452a1
commit ca1f675
Show file tree

Hide file tree

Showing 46 changed files with 63,582 additions and 3,180 deletions.
diff --git a/BOFdat/dna.py b/BOFdat/dna.py
@@ -56,6 +56,8 @@ def _convert_to_coefficient(model, ratio_genome, CELL_WEIGHT, DNA_RATIO):
     # Transform the ratios into mmol/gDW
     DNA_WEIGHT = CELL_WEIGHT * DNA_RATIO
 
+    DIPHOSPHATE_WEIGHT = 174.951262
+
     base_to_bigg = {'A': model.metabolites.datp_c, 'T': model.metabolites.dttp_c,
                     'C': model.metabolites.dctp_c, 'G': model.metabolites.dgtp_c}
     coefficients,metabolites = [],[]
@@ -65,7 +67,7 @@ def _convert_to_coefficient(model, ratio_genome, CELL_WEIGHT, DNA_RATIO):
         ratio = ratio_genome.get(letter)
         total_weight = ratio * DNA_WEIGHT
         metab = base_to_bigg.get(letter)
-        mol_weight = metab.formula_weight
+        mol_weight = metab.formula_weight - DIPHOSPHATE_WEIGHT
         mmols_per_cell = (total_weight / mol_weight) * 1000
         mmols_per_gDW = mmols_per_cell / CELL_WEIGHT
         coefficients.append(mmols_per_gDW)

diff --git a/BOFdat/dna.pyc b/BOFdat/dna.pyc
diff --git a/BOFdat/lipid.py b/BOFdat/lipid.py
@@ -5,47 +5,50 @@
 This module generates BOFsc for the lipid content of the cell.
 
 """
+
+
+def _import_model(path_to_model):
+    import cobra
+    extension = path_to_model.split('.')[-1]
+    if extension == 'json':
+        model = cobra.io.load_json_model(path_to_model)
+    elif extension == 'xml':
+        model = cobra.io.read_sbml_model(path_to_model)
+    else:
+        print('Model format type not supported')
+    return model
+
 def filter_for_model_metab(path_to_conversion_file, path_to_model):
     """
 
     :param path_to_conversion_file: path to a CSV file whose first column holds the names present in the lipidomic data and whose second column holds the matching BiGG identifiers. This conversion table is generated through manual curation by the modeller.
 
     :param path_to_model: a path to the model; supported formats are json and xml
 
-    :return: updated dictionary with metabolites found in the model
+    :return: updated dataframe with metabolites found in the model
     """
-
-    def import_model(path_to_model):
-        import cobra
-        extension = path_to_model.split('.')[-1]
-        if extension == 'json':
-            model = cobra.io.load_json_model(path_to_model)
-        elif extension == 'xml':
-            model = cobra.io.read_sbml_model(path_to_model)
-        else:
-            print('Model format type not supported')
-        return model
-
+    import pandas as pd
     # Get the model
-    model = import_model(path_to_model)
+    model = _import_model(path_to_model)
     # Get the metabolites in model
     model_metab_id = [m.id for m in model.metabolites]
     # Get to_bigg_dict
-    import pandas as pd
     to_bigg_df = pd.read_csv(path_to_conversion_file)
     to_bigg_dict = dict(zip([i for i in to_bigg_df[to_bigg_df.columns[0]]],
                             [i for i in to_bigg_df[to_bigg_df.columns[1]]]))
+
     # Get the metabolites that are in the model
-    model_metab = {k: v for k, v in to_bigg_dict.iteritems() if k in model_metab_id}
+    model_metab = {k: v for k, v in to_bigg_dict.iteritems() if v in model_metab_id}
 
     # Get the metabolites that are not in the model but present in OMICs data
-    non_model_metab = [k for k in to_bigg_dict.keys() if k not in model_metab_id]
+    non_model_metab = [k for k,v in to_bigg_dict.iteritems() if v not in model_metab_id]
     if len(non_model_metab) != 0:
-        print("These metabolites were not found in the model but were present in your metabolomic data, "
+        print("These lipids were not found in the model but were present in your lipidomic data, "
                      "consider adding them to your model: %s " % ([metab for metab in non_model_metab]))
 
-    return model_metab
+    model_metab_df = pd.DataFrame({'lipid_name':model_metab.keys(),'lipid_id':model_metab.values()},columns=['lipid_name','lipid_id'])
 
+    return model_metab_df
 
 def generate_coefficients(path_to_lipidomic,path_to_bigg_dict,
                      path_to_model,
@@ -83,24 +86,10 @@ def make_compliant_lipidomic(path_to_lipidomic):
 
     def make_compliant_bigg(path_to_bigg_dict):
         import pandas as pd
-        df = pd.read_csv(path_to_bigg_dict, names=['lipid_name','lipid_id'],skiprows=1)
-        keys = [i for i in df.lipid_name]
-        values = [i for i in df.lipid_id]
-
-        return dict(zip(keys,values))
-
-    #Operation 0.3
-    def import_model(path_to_model):
-        import cobra
-        extension = path_to_model.split('.')[-1]
-        if extension == 'json':
-            model = cobra.io.load_json_model(path_to_model)
-        elif extension == 'xml':
-            model = cobra.io.read_sbml_model(path_to_model)
-        return model
+        return pd.read_csv(path_to_bigg_dict, names=['lipid_name','lipid_id'],skiprows=1)
 
     # Operation 1
-    def convert_lipidomics_to_bigg(lipidomic,to_bigg_dict):
+    def convert_lipidomics_to_bigg(lipid_abun,lipid_conv):
         """
         This function generates a dictionary of BiGG identifiers that were generated through manual curation of the user
         with their relative abundances.
@@ -112,20 +101,18 @@ def convert_lipidomics_to_bigg(lipidomic,to_bigg_dict):
         :return: a dictionary containing BiGG identifiers and their relative abundances
         """
         import pandas as pd
-        #Generate the dictionary
-        keys,values = [],[]
+        #Generate the dictionary of lipid_id and relative abundances
+        df = pd.merge(left=lipid_conv, right=lipid_abun, on='lipid_name')
+        df1 = pd.concat([df.lipid_id, df.abundance], axis=1)
+        grouped = df1.groupby('lipid_id').agg(lambda x: sum(x))
 
-        for i,row in lipidomic.iterrows():
-            keys.append(to_bigg_dict.get(row.lipid_name))
-            values.append(row.abundance)
-
-        return dict(zip(keys,values))
+        return dict(zip([i for i in grouped.index], [i for i in grouped.abundance]))
 
     # Operation 2
     def get_relative_abundance(bigg_abundance):
         # Calculate relative abundances
         total_peak = sum(bigg_abundance.values())
-        return {k: v / total_peak for k, v in bigg_abundance.iteritems()}
+        return {k: float(v) / total_peak for k, v in bigg_abundance.iteritems()}
 
     # Operation 3
     def get_lipid_weight(model,compound_list,R_WEIGHT):
@@ -191,11 +178,11 @@ def calculate_coefficients(weight_dict,relative_abundance,LIPID_WEIGHT,CELL_WEIG
     #0.2- Make data compliant for the rest of the functions
     lipidomic_compliant = make_compliant_lipidomic(path_to_lipidomic)
     bigg_compliant = make_compliant_bigg(path_to_bigg_dict)
+
     #0.3- Get the model
-    model = import_model(path_to_model)
+    model = _import_model(path_to_model)
     #1- Generate a dictionary of BiGG IDs and relative abundances
     bigg_abundance = convert_lipidomics_to_bigg(lipidomic_compliant, bigg_compliant)
-
     #2- Get the relative abundance of each lipid
     rel_abundance = get_relative_abundance(bigg_abundance)
     #3- Get the weight of each lipid species

diff --git a/BOFdat/lipid.pyc b/BOFdat/lipid.pyc
diff --git a/BOFdat/protein.py b/BOFdat/protein.py
@@ -54,8 +54,11 @@ def _import_model(path_to_model):
 def _import_proteomic(path_to_proteomic):
     import pandas as pd
     proteomics =pd.read_csv(path_to_proteomic, names=['gene_ID', 'Mean'], skiprows=1)
-    keys = [k for k in proteomics.gene_ID]
-    values = [v for v in proteomics.Mean]
+    if proteomics.isnull().values.any():
+        print('Some proteins in your dataset do not have associated abundance, removing them')
+        proteomics = proteomics.dropna()
+    keys = [str(k) for k in proteomics.gene_ID]
+    values = [float(v) for v in proteomics.Mean]
     return dict(zip(keys, values))
 
 def _get_aa_composition(seq_dict):
@@ -67,7 +70,7 @@ def _get_aa_composition(seq_dict):
     # Keys = amino acid by letter code
     # Values = the occurrence of that amino acid
     list_of_dict = []
-    for k,v seq_dict.iteritems():
+    for k,v in seq_dict.iteritems():
         list_of_occurences = []
         # Get the occurrence for each letter
         for letter in AMINO_ACIDS:
@@ -119,7 +122,7 @@ def _get_norm_sum(normalized_dict):
 
     return norm_sum
 
-def _get_ratio(normalized_dict, norm_sum, PROTEIN_RATIO):
+def _get_ratio(normalized_dict, norm_sum, PROTEIN_RATIO, CELL_WEIGHT):
     # 2- Divide letter to norm_sum to get ratio of each amino acid in the cell
     # based on proteomic data
     ratio_dict = {'A': 0., 'C': 0., 'D': 0., 'E': 0., 'F': 0., 'G': 0., 'H': 0., 'I': 0.,
@@ -138,7 +141,8 @@ def _get_ratio(normalized_dict, norm_sum, PROTEIN_RATIO):
     return ratio_dict
 
 def _convert_to_coefficient(ratio_dict, path_to_model, CELL_WEIGHT):
-    model = import_model(path_to_model)
+    WATER_WEIGHT = 18.01528
+    model = _import_model(path_to_model)
     # 3- Convert gram ratios to mmol/g Dry weight
     '''
     To verify that the normalized to grams to get to the total amount of protein
@@ -161,7 +165,7 @@ def _convert_to_coefficient(ratio_dict, path_to_model, CELL_WEIGHT):
     # Get number of moles from number of grams
     for letter in AMINO_ACIDS:
         metab = letter_to_bigg.get(letter)
-        mol_weight = metab.formula_weight
+        mol_weight = metab.formula_weight - WATER_WEIGHT
         grams = ratio_dict.get(letter)
         mmols_per_cell = (grams / mol_weight) * 1000
         mmols_per_gDW = mmols_per_cell / CELL_WEIGHT
@@ -192,15 +196,17 @@ def generate_coefficients(path_to_genbank, path_to_model, path_to_proteomic, CEL
     """
     # Operations
     # 1- Parse the genome, extract protein sequence, count and store amino acid composition of each protein
+    if PROTEIN_RATIO > 1.:
+        print('Must enter ratio, value between 0. and 1.')
     seq_dict = _get_protein_sequence(path_to_genbank)
     list_of_dict = _get_aa_composition(seq_dict)
     normalized_dict = _normalize_aa_composition(list_of_dict,path_to_proteomic)
 
     # 2- Get coefficients from experimental proteomics data
     # Proteomics data should come in a 2 columns standard format protein_id:abundance
     norm_sum = _get_norm_sum(normalized_dict)
-    ratio_dict = get_ratio(normalized_dict, norm_sum, PROTEIN_RATIO)
-    biomass_coefficients = convert_to_coefficient(ratio_dict,path_to_model, CELL_WEIGHT)
+    ratio_dict = _get_ratio(normalized_dict, norm_sum, PROTEIN_RATIO, CELL_WEIGHT)
+    biomass_coefficients = _convert_to_coefficient(ratio_dict,path_to_model, CELL_WEIGHT)
 
     return biomass_coefficients
 

diff --git a/BOFdat/protein.pyc b/BOFdat/protein.pyc
diff --git a/BOFdat/rna.py b/BOFdat/rna.py
@@ -178,6 +178,7 @@ def _total_coefficients(mRNA_fractions, tRNA_fractions, rRNA_fractions, mRNA_RAT
 
 
 def _convert_to_mmolgDW(RNA_coefficients, model, RNA_RATIO, CELL_WEIGHT):
+    DIPHOSPHATE_WEIGHT = 174.951262
     # Get coefficients for BIOMASS
     # Transform the ratios into mmol/gDW
     RNA_WEIGHT = CELL_WEIGHT * RNA_RATIO
@@ -190,7 +191,7 @@ def _convert_to_mmolgDW(RNA_coefficients, model, RNA_RATIO, CELL_WEIGHT):
         ratio = RNA_coefficients.get(letter)
         total_weight = ratio * RNA_WEIGHT
         metab = rna_base_to_bigg.get(letter)
-        mol_weight = metab.formula_weight
+        mol_weight = metab.formula_weight - DIPHOSPHATE_WEIGHT
         mmols_per_cell = (total_weight / mol_weight) * 1000
         mmols_per_gDW = mmols_per_cell / CELL_WEIGHT
         coefficients.append(mmols_per_gDW)

diff --git a/BOFdat/rna.pyc b/BOFdat/rna.pyc
diff --git a/Example_usage/.~lock.lipidomic_conversion.csv# b/Example_usage/.~lock.lipidomic_conversion.csv#
@@ -0,0 +1 @@
+,jean-christophe,jeanchristophe-OptiPlex-7040,20.11.2017 14:45,file:///home/jean-christophe/.config/libreoffice/4;
diff --git a/Example_usage/.~lock.maintenance.csv# b/Example_usage/.~lock.maintenance.csv#
diff --git a/Example_usage/.~lock.new_lipidomic_abundances.csv# b/Example_usage/.~lock.new_lipidomic_abundances.csv#
@@ -0,0 +1 @@
+,jean-christophe,jeanchristophe-OptiPlex-7040,20.11.2017 18:23,file:///home/jean-christophe/.config/libreoffice/4;
diff --git a/Example_usage/.~lock.new_lipidomic_conversion.csv# b/Example_usage/.~lock.new_lipidomic_conversion.csv#
@@ -0,0 +1 @@
+,jean-christophe,jeanchristophe-OptiPlex-7040,20.11.2017 18:25,file:///home/jean-christophe/.config/libreoffice/4;
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		,jean-christophe,jeanchristophe-OptiPlex-7040,20.11.2017 14:45,file:///home/jean-christophe/.config/libreoffice/4;