From fe9ceb90e2d9676e8ac815f954c728bed73bc7ef Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Sat, 12 Aug 2017 07:12:28 +0200 Subject: [PATCH 1/9] save --- analysis/data_sets_I/run_analysis.py | 16 +++++----- .../data_sets_I/calculate_scored_lists_RF.py | 32 +++++++++---------- .../data_sets_II/calculate_scored_lists_RF.py | 28 ++++++++-------- scoring/fingerprint_lib.py | 4 +-- .../calculate_validation_methods.py | 24 +++++++------- 5 files changed, 52 insertions(+), 52 deletions(-) diff --git a/analysis/data_sets_I/run_analysis.py b/analysis/data_sets_I/run_analysis.py index ad4215d..456e9f8 100644 --- a/analysis/data_sets_I/run_analysis.py +++ b/analysis/data_sets_I/run_analysis.py @@ -17,19 +17,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -101,7 +101,7 @@ inpath = tmppath+'/'+dataset # loop over targets - for target in conf.set_data: + for target in conf.set_data[dataset]['ids']: print target # load results diff --git a/scoring/data_sets_I/calculate_scored_lists_RF.py b/scoring/data_sets_I/calculate_scored_lists_RF.py index 1f5c74e..3736d77 100644 --- a/scoring/data_sets_I/calculate_scored_lists_RF.py +++ b/scoring/data_sets_I/calculate_scored_lists_RF.py @@ -9,8 +9,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the random forest info # default parameters: criterion=gini, max_depth=10, @@ -24,19 +24,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -55,7 +55,7 @@ from rdkit import Chem, DataStructs import cPickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest from sklearn.tree import tree from rdkit.ML.Data import DataUtils @@ -92,7 +92,7 @@ read_dict['min_samples_leaf'] = lambda x: int(x) read_dict['n_jobs'] = lambda x: int(x) -forest._parallel_build_trees = ml_func._balanced_parallel_build_trees +#forest._parallel_build_trees = ml_func._balanced_parallel_build_trees # prepare command-line option parser usage = "usage: %prog [options] arg" @@ -110,7 +110,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.num and options.fp: + if options.num and options.fp: num_query_mols = options.num fp_build = options.fp else: @@ -133,7 +133,7 @@ scor.checkQueryMols(num_query_mols, conf.list_num_query_mols) # default machine-learning method variables - ml_dict = dict(criterion='gini', max_features='auto', n_jobs=1, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) + ml_dict = dict(criterion='gini', max_features='auto', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) if options.ml: ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) @@ -150,7 +150,7 @@ # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -166,7 +166,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -178,7 +178,7 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) diff --git a/scoring/data_sets_II/calculate_scored_lists_RF.py b/scoring/data_sets_II/calculate_scored_lists_RF.py index f060364..a9b7e2b 100644 --- a/scoring/data_sets_II/calculate_scored_lists_RF.py +++ b/scoring/data_sets_II/calculate_scored_lists_RF.py @@ -8,8 +8,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# 
other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the random forest info # default parameters: criterion=gini, max_depth=10, @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -54,7 +54,7 @@ from rdkit import Chem, DataStructs import cPickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest from sklearn.tree import tree from rdkit.ML.Data import DataUtils @@ -91,7 +91,7 @@ read_dict['min_samples_leaf'] = lambda x: int(x) read_dict['n_jobs'] = lambda x: int(x) -forest._parallel_build_trees = ml_func._balanced_parallel_build_trees +# forest._parallel_build_trees = ml_func._balanced_parallel_build_trees # prepare command-line option parser usage = "usage: %prog [options] arg" @@ -108,7 +108,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.fp: + if options.fp: fp_build = options.fp else: raise RuntimeError('one or more of the required options was not given!') @@ -150,7 +150,7 @@ # read in test actives and calculate fps div_actives = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -164,7 +164,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index 9c7b606..c5a5fda 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -12,7 +12,7 @@ from rdkit.Chem import rdMolDescriptors # implemented fingerprints: -# ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), +# ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), # atom pairs (ap), atom pairs bit vector (apbv), topological torsions (tt) # hashed atom pairs 
(hashap), hashed topological torsions (hashtt) --> with 1024 bits # ECFP4 (ecfp4), ECFP6 (ecfp6), ECFC4 (ecfc4), ECFC6 (ecfc6) --> with 1024 bits @@ -23,7 +23,7 @@ # RDKit with path length = 5 (rdk5), with path length = 6 (rdk6), with path length = 7 (rdk7) # 2D pharmacophore (pharm) ????????????? -nbits = 1024 +nbits = 2048 longbits = 16384 # dictionary diff --git a/validation/data_sets_I/calculate_validation_methods.py b/validation/data_sets_I/calculate_validation_methods.py index 794e53d..d4402cf 100644 --- a/validation/data_sets_I/calculate_validation_methods.py +++ b/validation/data_sets_I/calculate_validation_methods.py @@ -7,7 +7,7 @@ # # INPUT # required: -# -m [] : file containing the methods +# -m [] : file containing the methods # implemented methods are: AUC, BEDROC ([alpha] optional), # RIE ([alpha] optional), EF ([percentage] optional) # optional: @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -94,11 +94,11 @@ # optional arguments inpath = parentpath+'scoring/' - if options.inpath: + if options.inpath: inpath = [path+i for i in options.inpath] vfunc.checkPaths(inpath) outpath = path - if options.outpath: + if options.outpath: outpath = path+options.outpath vfunc.checkPaths([outpath]) remove_fps = [] @@ -125,7 +125,7 @@ # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'_.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'r') while 1: try: tmp = cPickle.load(myfile) @@ -150,7 +150,7 @@ # loop of repetitions for q in range(conf.num_reps): # loop over evaluation methods - for m in method_dict.keys(): + for m in method_dict.keys(): method_dict[m].runMethod(results, scores, q, -1) print "validation methods calculated" From f0763b26be8f6029316f43c0ec867d84ee3f6cf7 Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Tue, 10 Oct 2017 04:23:51 +0200 Subject: [PATCH 2/9] get most everything working with python3 --- analysis/data_sets_I/run_analysis.py | 12 ++--- analysis/data_sets_I/run_fp_summary.py | 22 ++++---- analysis/data_sets_I/run_method_summary.py | 22 ++++---- analysis/data_sets_II/run_analysis.py | 22 ++++---- analysis/data_sets_II/run_fp_summary.py | 20 ++++---- analysis/data_sets_II/run_method_summary.py | 18 +++---- scoring/data_sets_I/apply_fusion.py | 36 ++++++------- scoring/data_sets_I/calculate_scored_lists.py | 49 +++++++++--------- .../data_sets_I/calculate_scored_lists_LR.py | 49 +++++++++--------- .../data_sets_I/calculate_scored_lists_RF.py | 21 ++++---- .../data_sets_II/calculate_scored_lists_LR.py | 50 ++++++++++--------- .../data_sets_II/calculate_scored_lists_RF.py | 26 +++++----- scoring/scoring_functions.py | 26 +++++----- .../calculate_validation_methods.py | 18 +++---- .../calculate_validation_methods.py | 40 +++++++-------- validation/validation_functions.py | 33 ++++++------ 16 files changed, 238 insertions(+), 226 deletions(-) diff --git a/analysis/data_sets_I/run_analysis.py b/analysis/data_sets_I/run_analysis.py index 456e9f8..6ae1f8d 100644 --- a/analysis/data_sets_I/run_analysis.py +++ b/analysis/data_sets_I/run_analysis.py @@ -45,7 +45,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
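The hunks that follow swap cPickle for pickle across the analysis scripts: Python 3 removed the cPickle module, and plain pickle now uses the C implementation automatically. A minimal sketch of the reading side, assuming a validation file written by these scripts (the file name is illustrative):

    import gzip
    import pickle

    # pickle needs a binary stream, hence the accompanying 'r' -> 'rb' changes
    # (for gzip.open, plain 'r' already means binary in Python 3; 'rb' just makes it explicit)
    with gzip.open('validation_11140.pkl.gz', 'rb') as f:
        validation = pickle.load(f)
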
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path import numpy as np from scipy import special, stats from collections import defaultdict @@ -92,7 +92,7 @@ # loop over dataset sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # output directories and input directory outdir = outpath+'/'+dataset if not os.path.exists(outdir): os.makedirs(outdir) @@ -102,12 +102,12 @@ # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load results - validation = cPickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'r')) - methodkeys = validation.keys() - fpkeys = validation[methodkeys[0]].keys() + validation = pickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'rb')) + methodkeys = list(validation.keys()) + fpkeys = list(validation[methodkeys[0]].keys()) # if ranks is not yet set: prepare it if len(ranks) == 0: diff --git a/analysis/data_sets_I/run_fp_summary.py b/analysis/data_sets_I/run_fp_summary.py index 085304c..d5f4d48 100644 --- a/analysis/data_sets_I/run_fp_summary.py +++ b/analysis/data_sets_I/run_fp_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -81,17 +81,17 @@ # loop over dataset sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # input path inpath = outpath+'/'+dataset # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) - methodkeys = results.keys() + methodkeys = list(results.keys()) # if summary is not yet set: prepare it if len(summary) == 0: diff --git a/analysis/data_sets_I/run_method_summary.py b/analysis/data_sets_I/run_method_summary.py index c12e89e..048ad34 100644 --- a/analysis/data_sets_I/run_method_summary.py +++ b/analysis/data_sets_I/run_method_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. 
-# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -81,17 +81,17 @@ # loop over dataset sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # input directory inpath = outpath+'/'+dataset # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) - methodkeys = results.keys() + methodkeys = list(results.keys()) # if summary is not yet set: prepare it if len(summary) == 0: diff --git a/analysis/data_sets_II/run_analysis.py b/analysis/data_sets_II/run_analysis.py index be0dc42..9a69177 100644 --- a/analysis/data_sets_II/run_analysis.py +++ b/analysis/data_sets_II/run_analysis.py @@ -17,19 +17,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -45,7 +45,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
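The list(...) wrappers added in the next hunks are required because dict.keys() returns a view object in Python 3 and can no longer be indexed. A small illustration, with invented dictionary contents:

    validation = {'AUC': {'ecfp4': [0.71]}, 'EF5': {'ecfp4': [4.2]}}
    methodkeys = validation.keys()        # dict_keys view under Python 3
    # methodkeys[0]                       # TypeError: 'dict_keys' object is not subscriptable
    methodkeys = list(validation.keys())  # materialize before indexing
    fpkeys = list(validation[methodkeys[0]].keys())

Plain iteration over a view still works, which is why loops such as "for m in method_dict.keys():" are left unchanged throughout.
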
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path import numpy as np from scipy import special, stats from collections import defaultdict @@ -99,11 +99,11 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load results - validation = cPickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'r')) - methodkeys = validation.keys() + validation = pickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'rb')) + methodkeys = list(validation.keys()) fpkeys = validation[methodkeys[0]].keys() # if ranks is not yet set: prepare it diff --git a/analysis/data_sets_II/run_fp_summary.py b/analysis/data_sets_II/run_fp_summary.py index b28fe8a..8580cf5 100644 --- a/analysis/data_sets_II/run_fp_summary.py +++ b/analysis/data_sets_II/run_fp_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -84,11 +84,11 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) - methodkeys = results.keys() + methodkeys = list(results.keys()) # if summary is not yet set: prepare it if len(summary) == 0: diff --git a/analysis/data_sets_II/run_method_summary.py b/analysis/data_sets_II/run_method_summary.py index 66154ee..949e076 100644 --- a/analysis/data_sets_II/run_method_summary.py +++ b/analysis/data_sets_II/run_method_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -84,7 +84,7 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) diff --git a/scoring/data_sets_I/apply_fusion.py b/scoring/data_sets_I/apply_fusion.py index 7749be8..9f307b3 100644 --- a/scoring/data_sets_I/apply_fusion.py +++ b/scoring/data_sets_I/apply_fusion.py @@ -2,7 +2,7 @@ # $Id$ # # loads ranked lists from different -# models and/or fingerprints and +# models and/or fingerprints and # apply rank-based fusion # # INPUT @@ -22,19 +22,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -50,7 +50,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
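In apply_fusion.py below, the scoring scripts append one pickled [fp_name, scored_lists] record per fingerprint to the same gzip file, so the reader loops until pickle raises EOFError. The pattern in isolation (file name follows the convention used here; contents assumed):

    import gzip
    import pickle

    records = []
    with gzip.open('list_ChEMBL_11140_.pkl.gz', 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))  # one record per dump() call
            except EOFError:                    # no sentinel object: EOF ends the stream
                break
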
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -101,7 +101,7 @@ if options.rm_file: remove_fps = scor.readFPs(path+options.rm_file) outpath = path - if options.outpath: + if options.outpath: outpath = path+options.outpath scor.checkPath(outpath, 'output') do_append = False @@ -112,19 +112,19 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'_.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'_.pkl.gz', 'rb') while 1: try: - tmp = cPickle.load(myfile) + tmp = pickle.load(myfile) except (EOFError): break else: @@ -133,9 +133,9 @@ tmp[0] = scor.getName(tmp[0], scores.keys()) # input line: [fp_name, list of scored lists] scores[tmp[0]] = tmp[1] - print "scored lists read in" + print( "scored lists read in") if len(scores.keys()) < 2: - print "number of fingerprints/models < 2, nothing to be done" + print( "number of fingerprints/models < 2, nothing to be done") break if printfp: # determine the name of the fusion @@ -176,6 +176,6 @@ outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'_'+'.pkl.gz', 'ab+') # binary format else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'_'+'.pkl.gz', 'wb+') # binary format - cPickle.dump([fpname, new_scores], outfile, 2) + pickle.dump([fpname, new_scores], outfile, 2) outfile.close() - print "fusion ranking done and ranked list written" + print( "fusion ranking done and ranked list written") diff --git a/scoring/data_sets_I/calculate_scored_lists.py b/scoring/data_sets_I/calculate_scored_lists.py index 415c8f4..4c1f6ec 100644 --- a/scoring/data_sets_I/calculate_scored_lists.py +++ b/scoring/data_sets_I/calculate_scored_lists.py @@ -9,8 +9,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # --help : prints usage # @@ -20,19 +20,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. 
+# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -49,9 +49,9 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path +import pickle, gzip, sys, os, os.path from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser # import configuration file with global variables sys.path.insert(0, os.getcwd()+'/../../') @@ -86,7 +86,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.num and options.fp_file: + if options.num and options.fp_file: num_query_mols = options.num fp_file = path+options.fp_file else: @@ -112,15 +112,16 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print(dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFPDict(fp_names, line[2]) @@ -134,7 +135,8 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFPDict(fp_names, line[2]) @@ -144,23 +146,24 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFPDict(fp_names, line[2]) # store: [internal ID, dict with fps] decoys.append([line[1], fp_dict]) num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training lists - training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'r') + training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in range(conf.num_reps): - training_list = cPickle.load(training_input) + training_list = pickle.load(training_input) test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]] test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]] # loop over fps @@ -174,7 +177,7 @@ tmp_score = scor.getBulkSimilarity(tmp_mol[1], query_fps, simil_metric) # use max fusion # store : [similarity, internal ID, active/inactive] - single_score[fp].append([tmp_score[0], tmp_mol[0], tmp_mol[2]]) + single_score[fp].append([tmp_score[0], tmp_mol[0], tmp_mol[2]]) # rank list according to similarity scores[fp].append(sorted(single_score[fp], reverse=True)) @@ -184,6 +187,6 @@ else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in fp_names: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() 
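The line = line.decode('UTF-8') insertions above are needed because gzip.open(..., 'r') yields bytes under Python 3; each record is decoded before the comment check and the split. The pattern in isolation, with an illustrative file name:

    import gzip

    actives = []
    for line in gzip.open('cmp_list_ChEMBL_11140_actives.dat.gz', 'r'):
        line = line.decode('UTF-8')  # bytes -> str; gzip 'r' mode is binary in Python 3
        if line[0] != '#':           # skip comment lines
            # structure of line: [external ID, internal ID, SMILES]
            ext_id, int_id, smiles = line.rstrip().split()
            actives.append((int_id, smiles))
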
- print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_I/calculate_scored_lists_LR.py b/scoring/data_sets_I/calculate_scored_lists_LR.py index 4a7e804..81cd286 100644 --- a/scoring/data_sets_I/calculate_scored_lists_LR.py +++ b/scoring/data_sets_I/calculate_scored_lists_LR.py @@ -9,8 +9,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the logistic regression info # default parameters: penalty='l2', dual=0 (false), C=1.0, @@ -24,19 +24,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -53,9 +53,9 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.linear_model import LogisticRegression # import configuration file with global variables @@ -105,7 +105,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.num and options.fp: + if options.num and options.fp: num_query_mols = options.num fp_build = options.fp else: @@ -137,15 +137,16 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -161,7 +162,8 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -173,7 +175,8 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -182,17 +185,17 @@ # convert fps to numpy arrays np_fps_dcy = ml_func.getNumpy(decoys) num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training lists - training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'r') + training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in range(conf.num_reps): - print q - training_list = cPickle.load(training_input) + print( q) + training_list = pickle.load(training_input) test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]] test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]] @@ -223,6 +226,6 @@ else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['lr_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_I/calculate_scored_lists_RF.py b/scoring/data_sets_I/calculate_scored_lists_RF.py index 3736d77..74c1c29 100644 --- a/scoring/data_sets_I/calculate_scored_lists_RF.py +++ b/scoring/data_sets_I/calculate_scored_lists_RF.py @@ -53,7 +53,7 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from 
collections import defaultdict from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest @@ -142,14 +142,15 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -166,6 +167,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -178,6 +180,7 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -187,17 +190,17 @@ # convert fps to numpy arrays np_fps_dcy = ml_func.getNumpy(decoys) num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training lists - training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'r') + training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in range(conf.num_reps): - print q - training_list = cPickle.load(training_input) + print( q) + training_list = pickle.load(training_input) test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]] test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]] @@ -238,6 +241,6 @@ else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['rf_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_II/calculate_scored_lists_LR.py b/scoring/data_sets_II/calculate_scored_lists_LR.py index 12df4a8..f1f1e63 100644 --- a/scoring/data_sets_II/calculate_scored_lists_LR.py +++ b/scoring/data_sets_II/calculate_scored_lists_LR.py @@ -8,8 +8,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the logistic regression info # default parameters: penalty='l2', dual=0 (false), C=1.0, @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. 
-# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -52,9 +52,9 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.linear_model import LogisticRegression # import configuration file with global variables @@ -103,7 +103,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.fp: + if options.fp: fp_build = options.fp else: raise RuntimeError('one or more of the required options was not given!') @@ -133,10 +133,10 @@ # loop over targets for target in conf.set_data: - print target + print( target) # read in training actives and calculate fps - actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r')) + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) for k in actives.keys(): for i,m in enumerate(actives[k]): fp_dict = scor.getFP(fp_build, m[1]) @@ -145,7 +145,8 @@ # read in test actives and calculate fps div_actives = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -159,7 +160,8 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -169,20 +171,20 @@ np_fps_dcy = ml_func.getNumpy(decoys) firstchembl = False num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training and test lists - training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r') - test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r') + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in actives.keys(): - print q + print( q) num_actives = 
len(actives[q]) np_fps_act = ml_func.getNumpy(actives[q]) - training_list = cPickle.load(training_input) - test_list = cPickle.load(test_input) + training_list = pickle.load(training_input) + test_list = pickle.load(test_input) test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] # list with active/inactive info @@ -211,6 +213,6 @@ else: outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['lr_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_II/calculate_scored_lists_RF.py b/scoring/data_sets_II/calculate_scored_lists_RF.py index a9b7e2b..e40d390 100644 --- a/scoring/data_sets_II/calculate_scored_lists_RF.py +++ b/scoring/data_sets_II/calculate_scored_lists_RF.py @@ -52,7 +52,7 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest @@ -129,7 +129,7 @@ scor.checkSimil(simil_metric) # default machine-learning method variables - ml_dict = dict(criterion='gini', max_features='auto', n_jobs=1, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) + ml_dict = dict(criterion='gini', max_features='auto', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) if options.ml: ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) @@ -138,10 +138,10 @@ # loop over targets for target in conf.set_data: - print target + print( target) # read in training actives and calculate fps - actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r')) + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) for k in actives.keys(): for i,m in enumerate(actives[k]): fp_dict = scor.getFP(fp_build, m[1]) @@ -150,6 +150,7 @@ # read in test actives and calculate fps div_actives = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -164,6 +165,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -174,20 +176,20 @@ np_fps_dcy = ml_func.getNumpy(decoys) firstchembl = False num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training and test lists - training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r') - test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r') + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in actives.keys(): - print q + print( q) num_actives = len(actives[q]) np_fps_act = ml_func.getNumpy(actives[q]) - training_list = cPickle.load(training_input) - test_list = cPickle.load(test_input) + training_list = pickle.load(training_input) + test_list = 
pickle.load(test_input) test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] # list with active/inactive info @@ -226,6 +228,6 @@ else: outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['rf_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/scoring_functions.py b/scoring/scoring_functions.py index c7bdc6e..a1ec5a6 100644 --- a/scoring/scoring_functions.py +++ b/scoring/scoring_functions.py @@ -6,19 +6,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -100,13 +100,13 @@ def getBulkSimilarity(fp, fp_list, simil): # helper functions for the fusion def printFPs(fps, fpname): '''Prints a list of fingerprints''' - print "-------------------------------" - print "FUSION DONE FOR:" + print( "-------------------------------") + print( "FUSION DONE FOR:") for fp in fps: - print fp, - print "" - print "Name of fusion:", fpname - print "-------------------------------" + print( fp,) + print( "") + print( "Name of fusion:", fpname) + print( "-------------------------------") def getName(fp, fp_names): '''Determines the new name of a fingerprint in case diff --git a/validation/data_sets_I/calculate_validation_methods.py b/validation/data_sets_I/calculate_validation_methods.py index d4402cf..4b2085d 100644 --- a/validation/data_sets_I/calculate_validation_methods.py +++ b/validation/data_sets_I/calculate_validation_methods.py @@ -51,7 +51,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
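One subtlety in the scoring_functions.py hunk above: the Python 2 idiom "print fp," suppressed the trailing newline, but print( fp,) in Python 3 does not (the comma is just an inert trailing comma inside the call), so the fused fingerprint names now print one per line. Restoring the old single-line output would need the end keyword:

    print(fp, end=' ')  # Python 3 equivalent of the Python 2 `print fp,`
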
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path from collections import defaultdict from optparse import OptionParser from rdkit.ML.Scoring import Scoring @@ -113,22 +113,22 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # output directory outdir = outpath+'/'+dataset if not os.path.exists(outdir): os.makedirs(outdir) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'rb') while 1: try: - tmp = cPickle.load(myfile) + tmp = pickle.load(myfile) except (EOFError): break else: @@ -137,7 +137,7 @@ tmp[0] = vfunc.getName(tmp[0], scores.keys()) # input line: [fp_name, list of scored lists] scores[tmp[0]] = tmp[1] - print "scored lists read in" + print( "scored lists read in") if printfp: vfunc.printFPs(scores.keys()) printfp = False @@ -153,11 +153,11 @@ for m in method_dict.keys(): method_dict[m].runMethod(results, scores, q, -1) - print "validation methods calculated" + print( "validation methods calculated") # write results outf = gzip.open(outdir+'/validation_'+str(target)+'.pkl.gz', 'wb+') - cPickle.dump(results, outf, 2) + pickle.dump(results, outf, 2) outf.close() - print "results written out" + print( "results written out") diff --git a/validation/data_sets_II/calculate_validation_methods.py b/validation/data_sets_II/calculate_validation_methods.py index f370c1b..da6b3a6 100644 --- a/validation/data_sets_II/calculate_validation_methods.py +++ b/validation/data_sets_II/calculate_validation_methods.py @@ -7,7 +7,7 @@ # # INPUT # required: -# -m [] : file containing the methods +# -m [] : file containing the methods # implemented methods are: AUC, BEDROC ([alpha] optional), # RIE ([alpha] optional), EF ([percentage] optional) # optional: @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -51,7 +51,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
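In the hunks below, scores[scores.keys()[0]] becomes list(scores.values())[0]: a Python 3 keys() view cannot be indexed, so the number of repetitions is read off the first stored list directly. Illustration with placeholder data:

    scores = {'rf_ecfp4': [['rep0'], ['rep1']]}  # one scored list per repetition
    first = list(scores.values())[0]             # materialize the view, take any entry
    for q in range(len(first)):                  # loop over repetitions
        pass
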
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path from collections import defaultdict from optparse import OptionParser from rdkit.ML.Scoring import Scoring @@ -94,11 +94,11 @@ # optional arguments inpath = parentpath+'scoring/' - if options.inpath: + if options.inpath: inpath = [path+i for i in options.inpath] vfunc.checkPaths(inpath) outpath = path - if options.outpath: + if options.outpath: outpath = path+options.outpath vfunc.checkPaths([outpath]) remove_fps = [] @@ -117,15 +117,15 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+str(target)+'.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+str(target)+'.pkl.gz', 'rb') while 1: try: - tmp = cPickle.load(myfile) + tmp = pickle.load(myfile) except (EOFError): break else: @@ -134,7 +134,7 @@ tmp[0] = vfunc.getName(tmp[0], scores.keys()) # input line: [fp_name, list of scored lists] scores[tmp[0]] = tmp[1] - print "scored lists read in" + print( "scored lists read in") if printfp: vfunc.printFPs(scores.keys()) printfp = False @@ -145,16 +145,16 @@ method_dict[m].addNames(results) # loop of papers - for q in range(len(scores[scores.keys()[0]])): + for q in range(len(list(scores.values())[0])): # loop over evaluation methods - for m in method_dict.keys(): + for m in method_dict.keys(): method_dict[m].runMethod(results, scores, q, -1) - print "validation methods calculated" + print( "validation methods calculated") # write results outf = gzip.open(outdir+'/validation_'+str(target)+'.pkl.gz', 'wb+') - cPickle.dump(results, outf, 2) + pickle.dump(results, outf, 2) outf.close() - print "results written out" + print( "results written out") diff --git a/validation/validation_functions.py b/validation/validation_functions.py index 89fd19f..e8ffd21 100644 --- a/validation/validation_functions.py +++ b/validation/validation_functions.py @@ -91,28 +91,28 @@ def readFPs(filepath): def printInputParam(method_dict, inpath): '''Prints the input parameters''' - print "-------------------------------" - print "PARAMETERS USED" - print "Validation methods: " + print( "-------------------------------") + print( "PARAMETERS USED") + print( "Validation methods: ") for m in method_dict.keys(): if isinstance(method_dict[m], ParamEvalMethod): - print m, "- parameters:", method_dict[m].params + print( m, "- parameters:", method_dict[m].params) else: - print m - print "" - print "Input paths:" + print( m) + print( "") + print( "Input paths:") for inp in inpath: - print inp - print "-------------------------------" + print( inp) + print( "-------------------------------") def printFPs(fps): '''Prints a list of fingerprints''' - print "-------------------------------" - print "FINGERPRINTS CONSIDERED" + print( "-------------------------------") + print( "FINGERPRINTS CONSIDERED") for fp in fps: - print fp, - print "" - print "-------------------------------" + print( " ",fp) + print( "") + print( "-------------------------------") def getName(fp, fp_names): '''Determines the new name of a fingerprint in case @@ -157,12 +157,12 @@ def __init__(self, name, params, factor): for p in self.params: self.names.append(name + str(int(factor*p))) def addNames(self, results): - for n in self.names: + for n in self.names: results[n] = defaultdict(list) def runMethod(self, results, scores, query, index): tmp_list = [[] for i in range(len(self.names))] # loop over fingerprints - for k in 
scores.keys(): + for k in scores.keys(): tmp = self.calculate(scores[k][query], index) # loop over parameters for i in range(len(self.names)): @@ -193,4 +193,3 @@ def calculate(self, score, index): for p in self.params: tmp.append(Scoring.CalcRIE(score,index,p)) return tmp - From 4029f33cefd3f20dd3ffa5c787d60c149343cf19 Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Tue, 10 Oct 2017 05:52:19 +0200 Subject: [PATCH 3/9] add reversible (Crude!) --- scoring/fingerprint_lib.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index c5a5fda..cd24a07 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -11,6 +11,43 @@ from rdkit.Chem.ChemicalFeatures import BuildFeatureFactory from rdkit.Chem import rdMolDescriptors +import pickle +from rdkit.Chem import BRICS + +fragmentList = pickle.load(open('/home/glandrum/Projects/reversible_fingerprints/data/frags.min2.ordered.pkl','rb')) +from collections import Counter,defaultdict +def generateFragmentFingerprint(mol,nBits=4096,fragmentList=fragmentList): + frags = BRICS.BRICSDecompose(mol,minFragmentSize=2) + res = Counter() + for frag in frags: + try: + idx = fragmentList.index(frag) + except ValueError: + continue + res[idx%nBits] += 1 + return res +def FoldedRDKFingerprintCountBased(mol,fpSize=1024,**kwargs): + bitInfo = {} + unfolded = Chem.UnfoldedRDKFingerprintCountBased(mol,branchedPaths=False,minPath=3,maxPath=3,bitInfo=bitInfo, + **kwargs) + res = {} + newBitInfo = defaultdict(list) + for k,b in unfolded.GetNonzeroElements().items(): + res[k%fpSize] = b + newBitInfo[k%fpSize].extend(bitInfo[k]) + return res,newBitInfo + +from rdkit import DataStructs +def GetReversibleFingerprint(mol,nFragmentBits=4096,nRDKitBits=1024): + res = DataStructs.UIntSparseIntVect(nFragmentBits+nRDKitBits) + fragfp = generateFragmentFingerprint(mol,nBits=nFragmentBits) + rdkfp,_ = FoldedRDKFingerprintCountBased(mol,fpSize=nRDKitBits) + for bit,count in fragfp.items(): + res[bit] = count + for bit,count in rdkfp.items(): + res[bit+nFragmentBits] = count + return res + # implemented fingerprints: # ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), # atom pairs (ap), atom pairs bit vector (apbv), topological torsions (tt) @@ -28,6 +65,7 @@ # dictionary fpdict = {} +fpdict['reversible'] = GetReversibleFingerprint fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 0, nBits=nbits) fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=nbits) fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nbits) From 84d67e7be18d8d0b9a59a7e97f1c3348b8414751 Mon Sep 17 00:00:00 2001 From: greg landrum Date: Sat, 26 Nov 2022 05:50:26 +0100 Subject: [PATCH 4/9] update --- scoring/fingerprint_lib.py | 115 ++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index cd24a07..ac22b00 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -11,42 +11,22 @@ from rdkit.Chem.ChemicalFeatures import BuildFeatureFactory from rdkit.Chem import rdMolDescriptors -import pickle -from rdkit.Chem import BRICS -fragmentList = pickle.load(open('/home/glandrum/Projects/reversible_fingerprints/data/frags.min2.ordered.pkl','rb')) -from collections import Counter,defaultdict -def generateFragmentFingerprint(mol,nBits=4096,fragmentList=fragmentList): - frags = BRICS.BRICSDecompose(mol,minFragmentSize=2) - res = 
Counter() - for frag in frags: - try: - idx = fragmentList.index(frag) - except ValueError: - continue - res[idx%nBits] += 1 - return res -def FoldedRDKFingerprintCountBased(mol,fpSize=1024,**kwargs): +def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): bitInfo = {} - unfolded = Chem.UnfoldedRDKFingerprintCountBased(mol,branchedPaths=False,minPath=3,maxPath=3,bitInfo=bitInfo, + unfolded = Chem.UnfoldedRDKFingerprintCountBased(mol, + branchedPaths=False, + minPath=3, + maxPath=3, + bitInfo=bitInfo, **kwargs) res = {} newBitInfo = defaultdict(list) - for k,b in unfolded.GetNonzeroElements().items(): - res[k%fpSize] = b - newBitInfo[k%fpSize].extend(bitInfo[k]) - return res,newBitInfo + for k, b in unfolded.GetNonzeroElements().items(): + res[k % fpSize] = b + newBitInfo[k % fpSize].extend(bitInfo[k]) + return res, newBitInfo -from rdkit import DataStructs -def GetReversibleFingerprint(mol,nFragmentBits=4096,nRDKitBits=1024): - res = DataStructs.UIntSparseIntVect(nFragmentBits+nRDKitBits) - fragfp = generateFragmentFingerprint(mol,nBits=nFragmentBits) - rdkfp,_ = FoldedRDKFingerprintCountBased(mol,fpSize=nRDKitBits) - for bit,count in fragfp.items(): - res[bit] = count - for bit,count in rdkfp.items(): - res[bit+nFragmentBits] = count - return res # implemented fingerprints: # ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), @@ -60,45 +40,72 @@ def GetReversibleFingerprint(mol,nFragmentBits=4096,nRDKitBits=1024): # RDKit with path length = 5 (rdk5), with path length = 6 (rdk6), with path length = 7 (rdk7) # 2D pharmacophore (pharm) ????????????? -nbits = 2048 +nbits = 1024 longbits = 16384 # dictionary fpdict = {} -fpdict['reversible'] = GetReversibleFingerprint -fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 0, nBits=nbits) -fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=nbits) -fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nbits) -fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=nbits) +fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 0, nBits=nbits) +fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 1, nBits=nbits) +fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, nBits=nbits) +fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, nBits=nbits) fpdict['ecfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0) fpdict['ecfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1) fpdict['ecfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2) fpdict['ecfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3) -fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, useFeatures=True, nBits=nbits) -fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True, nBits=nbits) -fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, useFeatures=True, nBits=nbits) -fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1, useFeatures=True) -fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2, useFeatures=True) -fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3, useFeatures=True) -fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=longbits) -fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=longbits) -fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True, nBits=longbits) -fpdict['lfcfp6'] = lambda m: 
AllChem.GetMorganFingerprintAsBitVect(m, 3, useFeatures=True, nBits=longbits) +fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 1, useFeatures=True, nBits=nbits) +fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, useFeatures=True, nBits=nbits) +fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, useFeatures=True, nBits=nbits) +fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint( + m, 1, useFeatures=True) +fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint( + m, 2, useFeatures=True) +fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint( + m, 3, useFeatures=True) +fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, nBits=longbits) +fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, nBits=longbits) +fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, useFeatures=True, nBits=longbits) +fpdict['lfcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, useFeatures=True, nBits=longbits) fpdict['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m) -fpdict['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m) -fpdict['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m) -fpdict['hashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(m, nBits=nbits) -fpdict['hashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=nbits) +fpdict['ap'] = lambda m: rdMolDescriptors.GetAtomPairFingerprint(m) +fpdict['tt'] = lambda m: rdMolDescriptors.GetTopologicalTorsionFingerprint(m) +fpdict[ + 'hashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( + m, nBits=nbits) +fpdict[ + 'hashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( + m, nBits=nbits) fpdict['avalon'] = lambda m: fpAvalon.GetAvalonFP(m, nbits) fpdict['laval'] = lambda m: fpAvalon.GetAvalonFP(m, longbits) -fpdict['rdk5'] = lambda m: Chem.RDKFingerprint(m, maxPath=5, fpSize=nbits, nBitsPerHash=2) -fpdict['rdk6'] = lambda m: Chem.RDKFingerprint(m, maxPath=6, fpSize=nbits, nBitsPerHash=2) -fpdict['rdk7'] = lambda m: Chem.RDKFingerprint(m, maxPath=7, fpSize=nbits, nBitsPerHash=2) +fpdict['rdk5'] = lambda m: Chem.RDKFingerprint( + m, maxPath=5, fpSize=nbits, nBitsPerHash=2) +fpdict['rdk6'] = lambda m: Chem.RDKFingerprint( + m, maxPath=6, fpSize=nbits, nBitsPerHash=2) +fpdict['rdk7'] = lambda m: Chem.RDKFingerprint( + m, maxPath=7, fpSize=nbits, nBitsPerHash=2) +fpdict['lrdk5'] = lambda m: Chem.RDKFingerprint( + m, maxPath=5, fpSize=longbits, nBitsPerHash=2) +fpdict['lrdk6'] = lambda m: Chem.RDKFingerprint( + m, maxPath=6, fpSize=longbits, nBitsPerHash=2) +fpdict['lrdk7'] = lambda m: Chem.RDKFingerprint( + m, maxPath=7, fpSize=longbits, nBitsPerHash=2) def CalculateFP(fp_name, smiles): m = Chem.MolFromSmiles(smiles) if m is None: - raise ValueError('SMILES cannot be converted to a RDKit molecules:', smiles) + raise ValueError('SMILES cannot be converted to a RDKit molecules:', + smiles) return fpdict[fp_name](m) From cef81bf9f5a725e40960c195ae7c319711e7f45e Mon Sep 17 00:00:00 2001 From: greg landrum Date: Wed, 30 Nov 2022 04:15:29 +0100 Subject: [PATCH 5/9] seems to work --- .../data_sets_II/calculate_scored_lists_RF.py | 10 +- .../calculate_scored_lists_XGB.py | 225 ++++++++++++++++++ .../calculate_scored_lists_lmnb.py | 213 +++++++++++++++++ scoring/fingerprint_lib.py | 50 ++-- scoring/ml_functions_13.py | 81 ++++--- 5 files changed, 510 insertions(+), 69 deletions(-) 
create mode 100644 scoring/data_sets_II/calculate_scored_lists_XGB.py create mode 100644 scoring/data_sets_II/calculate_scored_lists_lmnb.py diff --git a/scoring/data_sets_II/calculate_scored_lists_RF.py b/scoring/data_sets_II/calculate_scored_lists_RF.py index e40d390..21dc3c1 100644 --- a/scoring/data_sets_II/calculate_scored_lists_RF.py +++ b/scoring/data_sets_II/calculate_scored_lists_RF.py @@ -55,8 +55,7 @@ import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict from optparse import OptionParser -from sklearn.ensemble import RandomForestClassifier, forest -from sklearn.tree import tree +from sklearn.ensemble import RandomForestClassifier from rdkit.ML.Data import DataUtils from multiprocessing import Pool @@ -85,7 +84,6 @@ read_dict = {} read_dict['criterion'] = lambda x: x read_dict['max_depth'] = lambda x: int(x) -read_dict['max_features'] = lambda x: x read_dict['num_estimators'] = lambda x: int(x) read_dict['min_samples_split'] = lambda x: int(x) read_dict['min_samples_leaf'] = lambda x: int(x) @@ -99,7 +97,7 @@ parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train random forest with") parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") -parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, max_features=auto (=sqrt), num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") ############# MAIN PART ######################## @@ -129,12 +127,12 @@ scor.checkSimil(simil_metric) # default machine-learning method variables - ml_dict = dict(criterion='gini', max_features='auto', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) + ml_dict = dict(criterion='gini', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) if options.ml: ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) # initialize machine-learning method - ml = RandomForestClassifier(criterion=ml_dict['criterion'], max_features=ml_dict['max_features'], min_samples_split=ml_dict['min_samples_split'], max_depth=ml_dict['max_depth'], min_samples_leaf=ml_dict['min_samples_leaf'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs']) + ml = RandomForestClassifier(criterion=ml_dict['criterion'], min_samples_split=ml_dict['min_samples_split'], max_depth=ml_dict['max_depth'], min_samples_leaf=ml_dict['min_samples_leaf'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs']) # loop over targets for target in conf.set_data: diff --git a/scoring/data_sets_II/calculate_scored_lists_XGB.py b/scoring/data_sets_II/calculate_scored_lists_XGB.py new file mode 100644 index 0000000..3e65dad --- /dev/null +++ b/scoring/data_sets_II/calculate_scored_lists_XGB.py @@ -0,0 +1,225 @@ +# +# calculates fingerprints and scores lists +# based on 
the predicted probability
+#
+# INPUT
+# required:
+# -f [] : fingerprint to build the XGBoost classifier with
+# optional:
+# -o [] : relative output path (default: pwd)
+# -a : append to the output file (default: overwrite)
+# -s [] : similarity metric (default: Dice,
+#         other options: Tanimoto, Cosine, Russel, Kulczynski,
+#         McConnaughey, Manhattan, RogotGoldberg)
+# -m [] : file containing the XGBoost parameters
+#         default parameters: max_depth=3,
+#         num_estimators=300, n_jobs=4
+# --help : prints usage
+#
+# OUTPUT: for each target in each data set
+# a file with a list (1 element) of XGB prediction
+# per XGB prediction: [name, list of 50 scored lists]
+#
+# Copyright (c) 2022, Greg Landrum
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
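This new scorer drives XGBoost through its scikit-learn-style wrapper, so relative to the RF script only the classifier class and its defaults change. A minimal sketch of the interface relied on below, using the script's defaults (max_depth=3, n_estimators=300, n_jobs=4) and hypothetical toy fingerprints:

import numpy as np
from xgboost import XGBClassifier

# toy data: rows are fingerprint bit vectors, 1 = active, 0 = decoy
X = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]])
y = np.array([1, 0, 1, 0])

ml = XGBClassifier(max_depth=3, n_estimators=300, n_jobs=4)
ml.fit(X, y)
# column 1 of predict_proba is P(active), which the script ranks on
print(ml.predict_proba(X)[:, 1])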
+# + +from rdkit import Chem, DataStructs +import pickle, gzip, sys, os, os.path, numpy +from collections import defaultdict +from optparse import OptionParser +from xgboost import XGBClassifier +from rdkit.ML.Data import DataUtils +from multiprocessing import Pool + +# import configuration file with global variables +sys.path.insert(0, os.getcwd()+'/../../') +import configuration_file_II as conf + +# import functions for scoring step +sys.path.insert(0, os.getcwd()+'/../') +import scoring_functions as scor + +# import ML functions +import ml_functions_13 as ml_func + +# paths +cwd = os.getcwd() +parentpath = cwd+'/../../' +inpath_cmp = parentpath+'compounds/' +inpath_list = parentpath+'query_lists/data_sets_II/ChEMBL/' +path = cwd+'/' + +# flag to read in ChEMBL decoys only once +firstchembl = True + +# dictionary for readMLFile() +read_dict = {} +read_dict['max_depth'] = lambda x: int(x) +read_dict['num_estimators'] = lambda x: int(x) +read_dict['n_jobs'] = lambda x: int(x) + +# forest._parallel_build_trees = ml_func._balanced_parallel_build_trees + +# prepare command-line option parser +usage = "usage: %prog [options] arg" +parser = OptionParser(usage) +parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train random forest with") +parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") +parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") +parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") + +############# MAIN PART ######################## +if __name__=='__main__': + + # read in command line options + (options, args) = parser.parse_args() + # required arguments + if options.fp: + fp_build = options.fp + else: + raise RuntimeError('one or more of the required options was not given!') + + # optional arguments + do_append = False + if options.do_append: do_append = options.do_append + simil_metric = 'Dice' + if options.simil: simil_metric = options.simil + outpath = path + outpath_set = False + if options.outpath: + outpath_set = True + outpath = path+options.outpath + + # check for sensible input + if outpath_set: scor.checkPath(outpath, 'output') + scor.checkSimil(simil_metric) + + # default machine-learning method variables + ml_dict = dict( n_jobs=4, max_depth=3, num_estimators=300) + if options.ml: + ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) + + # initialize machine-learning method + ml = XGBClassifier(max_depth=ml_dict['max_depth'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs']) + + # loop over targets + for target in conf.set_data: + print( target) + + # read in training actives and calculate fps + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) + for k in actives.keys(): + for i,m in enumerate(actives[k]): + fp_dict = scor.getFP(fp_build, m[1]) + actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict] + + # read in test actives and calculate fps + div_actives = [] + for line in 
gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + div_actives.append([line[1], fp_dict]) + num_test_actives = conf.num_div_act - 1 + # convert fps to numpy arrays + np_fps_div_act = ml_func.getNumpy(div_actives) + + # read in decoys and calculate fps + if firstchembl: + decoys = [] + for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + decoys.append([line[1], fp_dict]) + # convert fps to numpy arrays + np_fps_dcy = ml_func.getNumpy(decoys) + firstchembl = False + num_decoys = len(decoys) + print( "molecules read in and fingerprints calculated") + + # open training and test lists + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') + # to store the scored lists + scores = defaultdict(list) + # loop over repetitions + for q in actives.keys(): + print( q) + num_actives = len(actives[q]) + np_fps_act = ml_func.getNumpy(actives[q]) + training_list = pickle.load(training_input) + test_list = pickle.load(test_input) + test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] + + # list with active/inactive info + ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives) + # training fps + train_fps = [actives[q][i][1] for i in range(num_actives)] + np_train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]] + # fit random forest + ml.fit(np_train_fps, ys_fit) + + # test fps and molecule info + test_fps = [div_actives[i][1] for i in test_list[:num_test_actives]] + test_fps += [decoys[i][1] for i in test_list[num_test_actives:]] + np_test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]] + np_test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]] + test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]] + test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]] + + # calculate similarity with standard fp + std_simil = [] + for fp in test_fps: + tmp_simil = scor.getBulkSimilarity(fp, train_fps, simil_metric) + tmp_simil.sort(reverse=True) + std_simil.append(tmp_simil[0]) + + # rank based on probability (and second based on similarity) + single_score = ml.predict_proba(np_test_fps) + # store: [probability, similarity, internal ID, active/inactive] + single_score = [[m[1], s, t[0], t[1]] for m,s,t in zip(single_score,std_simil,test_mols)] + single_score.sort(reverse=True) + scores['xgb_'+fp_build].append(single_score) + + # write scores to file + if do_append: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format + else: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format + for fp in ['xgb_'+fp_build]: + pickle.dump([fp, scores[fp]], outfile, 2) + outfile.close() + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_II/calculate_scored_lists_lmnb.py b/scoring/data_sets_II/calculate_scored_lists_lmnb.py new file mode 100644 index 0000000..4ce4405 --- /dev/null +++ 
b/scoring/data_sets_II/calculate_scored_lists_lmnb.py @@ -0,0 +1,213 @@ +# +# calculates fingerprints and scores lists +# based on the predicted probability +# +# INPUT +# required: +# -f [] : fingerprint to build the Naive Bayes with +# optional: +# -o [] : relative output path (default: pwd) +# -a : append to the output file (default: overwrite) +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, +# McConnaughey, Manhattan, RogotGoldberg) +# -r [] : file containing the Naive Bayes info +# default parameters: alpha=1.0, binarize=None, +# fit_prior=1 (True) +# --help : prints usage +# +# OUTPUT: for each target in each data set +# a file with a list (1 element) of NB prediction +# per NB prediction: [name, list of 50 scored lists] +# +# Copyright (c) 2022, Greg Landrum +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
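LaplacianNB is imported below from an external "bayes" package rather than from scikit-learn, so its exact API cannot be checked here; judging only from the calls the script makes, it takes each sample as a set of on-bit indices instead of a dense array. A sketch of that conversion with illustrative data; the commented-out calls mirror the code that follows:

# dense fingerprint row -> set of on-bit indices
fp = [0, 1, 0, 0, 1, 1]                          # hypothetical example row
on_bits = set(i for i, x in enumerate(fp) if x)  # -> {1, 4, 5}

# ml = LaplacianNB(alpha=1.0, fit_prior=True)
# ml.fit([on_bits], [1])              # lists of on-bit sets plus 1/0 labels
# probs = ml.predict_proba([on_bits])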
+# + +from rdkit import Chem, DataStructs +import pickle, gzip, sys, os, os.path, numpy +from collections import defaultdict +from optparse import OptionParser +from bayes.LaplacianNB import LaplacianNB + +# import configuration file with global variables +sys.path.insert(0, os.getcwd()+'/../../') +import configuration_file_II as conf + +# import functions for scoring step +sys.path.insert(0, os.getcwd()+'/../') +import scoring_functions as scor + +# import ML functions +import ml_functions_13 as ml_func + +# paths +cwd = os.getcwd() +parentpath = cwd+'/../../' +inpath_cmp = parentpath+'compounds/' +inpath_list = parentpath+'query_lists/data_sets_II/ChEMBL/' +path = cwd+'/' + +# flag to read in ChEMBL decoys only once +firstchembl = True + +# dictionary for readMLFile() +read_dict = {} +read_dict['alpha'] = lambda x: float(x) +read_dict['fit_prior'] = lambda x: bool(x) + +# prepare command-line option parser +usage = "usage: %prog [options] arg" +parser = OptionParser(usage) +parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train Naive Bayes with") +parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") +parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the Naive Bayes info (default parameters: alpha=1.0, binarize=None, fit_prior=1 (True))") +parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") + +############# MAIN PART ######################## +if __name__=='__main__': + + # read in command line options + (options, args) = parser.parse_args() + # required arguments + if options.fp: + fp_build = options.fp + else: + raise RuntimeError('one or more of the required options was not given!') + + # optional arguments + do_append = False + if options.do_append: do_append = options.do_append + simil_metric = 'Dice' + if options.simil: simil_metric = options.simil + outpath = path + outpath_set = False + if options.outpath: + outpath_set = True + outpath = path+options.outpath + + # check for sensible input + if outpath_set: scor.checkPath(outpath, 'output') + scor.checkSimil(simil_metric) + + # default machine-learning method variables + ml_dict = dict(alpha=1.0, fit_prior=True) + if options.ml: + ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) + + # initialize machine-learning method + ml = LaplacianNB(alpha=ml_dict['alpha'], fit_prior=ml_dict['fit_prior']) + + # loop over targets + for target in conf.set_data: + print(target) + + # read in training actives and calculate fps + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) + for k in actives.keys(): + for i,m in enumerate(actives[k]): + fp_dict = scor.getFP(fp_build, m[1]) + actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict] + + # read in test actives and calculate fps + div_actives = [] + for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + div_actives.append([line[1], fp_dict]) + 
num_test_actives = conf.num_div_act - 1 + # convert fps to numpy arrays + np_fps_div_act = ml_func.getNumpy(div_actives,dtyp=int) + + # read in decoys and calculate fps + if firstchembl: + decoys = [] + for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + decoys.append([line[1], fp_dict]) + # convert fps to numpy arrays + np_fps_dcy = ml_func.getNumpy(decoys,dtyp=int) + firstchembl = False + num_decoys = len(decoys) + print("molecules read in and fingerprints calculated") + + # open training and test lists + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') + # to store the scored lists + scores = defaultdict(list) + # loop over repetitions + for q in actives.keys(): + print(q) + num_actives = len(actives[q]) + np_fps_act = ml_func.getNumpy(actives[q],dtyp=int) + training_list = pickle.load(training_input) + test_list = pickle.load(test_input) + test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] + + # list with active/inactive info + ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives) + # training fps + train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]] + # lmnb wants sets of on-bits + train_fps = [set([i for i,x in enumerate(fp) if x]) for fp in train_fps] + # fit Naive Bayes + ml.fit(train_fps, ys_fit) + + # test fps and molecule info + test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]] + test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]] + test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]] + test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]] + + test_fps = [set([i for i,x in enumerate(fp) if x]) for fp in test_fps] + # rank based on probability + single_score = ml.predict_proba(test_fps) + # store: [probability, internal ID, active/inactive] + single_score = [[s[1], m[0], m[1]] for s,m in zip(single_score, test_mols)] + single_score.sort(reverse=True) + scores['lmnb_'+fp_build].append(single_score) + + # write scores to file + if do_append: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format + else: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format + for fp in ['lmnb_'+fp_build]: + pickle.dump([fp, scores[fp]], outfile, 2) + outfile.close() + print("scoring done and scored lists written") + #break diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index ac22b00..56b9fc7 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -29,14 +29,14 @@ def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): # implemented fingerprints: -# ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), +# mfc0 (mfc0), mfp0 (mfp0), MACCS (maccs), # atom pairs (ap), atom pairs bit vector (apbv), topological torsions (tt) # hashed atom pairs (hashap), hashed topological torsions (hashtt) --> with 1024 bits -# ECFP4 (ecfp4), ECFP6 (ecfp6), ECFC4 (ecfc4), ECFC6 (ecfc6) --> with 1024 bits -# FCFP4 (fcfp4), FCFP6 (fcfp6), FCFC4 (fcfc4), FCFC6 (fcfc6) --> with 1024 bits +# mfp2 (mfp2), mfp3 (mfp3), mfc2 (mfc2), mfc3 (mfc3) --> with 1024 bits +# fmfp2 (fmfp2), fmfp3 (fmfp3), fmfc2 (fmfc2), fmfc3 (fmfc3) --> with 1024 bits # Avalon 
(avalon) --> with 1024 bits # long Avalon (laval) --> with 16384 bits -# long ECFP4 (lecfp4), long ECFP6 (lecfp6), long FCFP4 (lfcfp4), long FCFP6 (lfcfp6) --> with 16384 bits +# long mfp2 (lmfp2), long mfp3 (lmfp3), long fmfp2 (lfmfp2), long fmfp3 (lfmfp3) --> with 16384 bits # RDKit with path length = 5 (rdk5), with path length = 6 (rdk6), with path length = 7 (rdk7) # 2D pharmacophore (pharm) ????????????? @@ -45,37 +45,37 @@ def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): # dictionary fpdict = {} -fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 0, nBits=nbits) -fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp1'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 1, nBits=nbits) -fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=nbits) -fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=nbits) -fpdict['ecfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0) -fpdict['ecfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1) -fpdict['ecfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2) -fpdict['ecfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3) -fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0) +fpdict['mfc1'] = lambda m: AllChem.GetMorganFingerprint(m, 1) +fpdict['mfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 2) +fpdict['mfc3'] = lambda m: AllChem.GetMorganFingerprint(m, 3) +fpdict['fmfp1'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 1, useFeatures=True, nBits=nbits) -fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['fmfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=nbits) -fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['fmfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=nbits) -fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint( +fpdict['fmfc1'] = lambda m: AllChem.GetMorganFingerprint( m, 1, useFeatures=True) -fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint( +fpdict['fmfc2'] = lambda m: AllChem.GetMorganFingerprint( m, 2, useFeatures=True) -fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint( +fpdict['fmfc3'] = lambda m: AllChem.GetMorganFingerprint( m, 3, useFeatures=True) -fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lmfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=longbits) -fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lmfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=longbits) -fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lfmfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=longbits) -fpdict['lfcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lfmfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=longbits) fpdict['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m) fpdict['ap'] = lambda m: rdMolDescriptors.GetAtomPairFingerprint(m) @@ -86,6 +86,12 @@ def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): fpdict[ 'hashtt'] = lambda m: 
rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( m, nBits=nbits) +fpdict[ + 'lhashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( + m, nBits=longbits) +fpdict[ + 'lhashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( + m, nBits=longbits) fpdict['avalon'] = lambda m: fpAvalon.GetAvalonFP(m, nbits) fpdict['laval'] = lambda m: fpAvalon.GetAvalonFP(m, longbits) fpdict['rdk5'] = lambda m: Chem.RDKFingerprint( diff --git a/scoring/ml_functions_13.py b/scoring/ml_functions_13.py index d26ed6a..5fa077b 100644 --- a/scoring/ml_functions_13.py +++ b/scoring/ml_functions_13.py @@ -37,59 +37,58 @@ from rdkit.ML.Data import DataUtils import numpy from multiprocessing import Pool -from sklearn.ensemble import RandomForestClassifier, forest -from sklearn.tree import tree +from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import BernoulliNB -### FOR SKLEARN VERSION 0.13 ### +# ### FOR SKLEARN VERSION 0.13 ### -# HELPER FUNCTIONS FOR RANDOM FOREST -def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose): - """Private function used to build a batch of trees within a job""" - from sklearn.utils import check_random_state - from sklearn.utils.fixes import bincount - import random - MAX_INT = numpy.iinfo(numpy.int32).max - random_state = check_random_state(seed) +# # HELPER FUNCTIONS FOR RANDOM FOREST +# def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose): +# """Private function used to build a batch of trees within a job""" +# from sklearn.utils import check_random_state +# from sklearn.utils.fixes import bincount +# import random +# MAX_INT = numpy.iinfo(numpy.int32).max +# random_state = check_random_state(seed) - trees = [] - for i in xrange(n_trees): - if verbose > 1: - print("building tree %d of %d" % (i+1, n_trees)) - seed = random_state.randint(MAX_INT) +# trees = [] +# for i in xrange(n_trees): +# if verbose > 1: +# print("building tree %d of %d" % (i+1, n_trees)) +# seed = random_state.randint(MAX_INT) - tree = forest._make_estimator(append = False) - tree.set_params(compute_importances=forest.compute_importances) - tree.set_params(random_state = check_random_state(seed)) +# tree = forest._make_estimator(append = False) +# tree.set_params(compute_importances=forest.compute_importances) +# tree.set_params(random_state = check_random_state(seed)) - if forest.bootstrap: - n_samples = X.shape[0] - if sample_weight is None: - curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64) - else: - curr_sample_weight = sample_weight.copy() +# if forest.bootstrap: +# n_samples = X.shape[0] +# if sample_weight is None: +# curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64) +# else: +# curr_sample_weight = sample_weight.copy() - ty = list(enumerate(y)) - indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0] - indices2 = random_state.randint(0, len(indices), len(indices)) - indices = [indices[j] for j in indices2] - sample_counts = bincount(indices, minlength=n_samples) +# ty = list(enumerate(y)) +# indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0] +# indices2 = random_state.randint(0, len(indices), len(indices)) +# indices = [indices[j] for j in indices2] +# sample_counts = bincount(indices, minlength=n_samples) - curr_sample_weight *= sample_counts - curr_sample_mask = sample_mask.copy() - 
curr_sample_mask[sample_counts==0] = False +# curr_sample_weight *= sample_counts +# curr_sample_mask = sample_mask.copy() +# curr_sample_mask[sample_counts==0] = False - tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False) - tree.indices = curr_sample_mask - else: - tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False) - trees.append(tree) - return trees +# tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False) +# tree.indices = curr_sample_mask +# else: +# tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False) +# trees.append(tree) +# return trees -def getNumpy(inlist): +def getNumpy(inlist,dtyp=float): outlist = [] for i in inlist: - arr = numpy.zeros((3,), tree.DTYPE) + arr = numpy.zeros((3,), dtyp) DataStructs.ConvertToNumpyArray(i[1], arr) outlist.append(arr) return outlist From 23e826d2948dfb669e3731255e5038ea8d357edb Mon Sep 17 00:00:00 2001 From: greg landrum Date: Wed, 30 Nov 2022 04:47:55 +0100 Subject: [PATCH 6/9] additional validation funcs --- validation/validation_functions.py | 105 ++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/validation/validation_functions.py b/validation/validation_functions.py index e8ffd21..a1b1861 100644 --- a/validation/validation_functions.py +++ b/validation/validation_functions.py @@ -37,6 +37,7 @@ import os from collections import defaultdict from rdkit.ML.Scoring import Scoring +from sklearn.metrics import roc_auc_score, average_precision_score def checkPaths(filepaths): @@ -45,21 +46,27 @@ def checkPaths(filepaths): if not os.path.exists(f): raise IOError('path does not exist:', f) + def _readMethods(line): '''Helper function for readMethods()''' - if line: # if params are provided + if line: # if params are provided params = [] - for i in line: params.append(float(i)) + for i in line: + params.append(float(i)) else: raise ValueError("Method requires parameters.") return params + # dictionary for readMethods() read_dict = {} read_dict['AUC'] = lambda l: EvalMethod(l[0]) read_dict['EF'] = lambda l: EFMethod(l[0], _readMethods(l[1:]), 100) read_dict['BEDROC'] = lambda l: BEDROCMethod(l[0], _readMethods(l[1:]), 1) read_dict['RIE'] = lambda l: RIEMethod(l[0], _readMethods(l[1:]), 1) +read_dict['AUROC'] = lambda l: AUROCEvalMethod(l[0]) +read_dict['AUPRC'] = lambda l: AUPRCEvalMethod(l[0]) + def readMethods(filepath): '''Reads the methods names and parameters from a file''' @@ -70,11 +77,12 @@ def readMethods(filepath): else: method_dict = {} for line in myfile: - if line[0] != "#": # ignore comments + if line[0] != "#": # ignore comments line = line.rstrip().split() method_dict[line[0]] = read_dict[line[0]](line) return method_dict + def readFPs(filepath): '''Reads a list of fingerprints from a file''' try: @@ -84,35 +92,38 @@ def readFPs(filepath): else: fps = [] for line in myfile: - if line[0] != "#": # ignore comments + if line[0] != "#": # ignore comments line = line.rstrip().split() fps.append(line[0]) return fps + def printInputParam(method_dict, inpath): '''Prints the input parameters''' - print( "-------------------------------") - print( "PARAMETERS USED") - print( "Validation methods: ") + print("-------------------------------") + print("PARAMETERS USED") + print("Validation methods: ") for m in method_dict.keys(): if 
isinstance(method_dict[m], ParamEvalMethod): - print( m, "- parameters:", method_dict[m].params) + print(m, "- parameters:", method_dict[m].params) else: - print( m) - print( "") - print( "Input paths:") + print(m) + print("") + print("Input paths:") for inp in inpath: - print( inp) - print( "-------------------------------") + print(inp) + print("-------------------------------") + def printFPs(fps): '''Prints a list of fingerprints''' - print( "-------------------------------") - print( "FINGERPRINTS CONSIDERED") + print("-------------------------------") + print("FINGERPRINTS CONSIDERED") for fp in fps: - print( " ",fp) - print( "") - print( "-------------------------------") + print(" ", fp) + print("") + print("-------------------------------") + def getName(fp, fp_names): '''Determines the new name of a fingerprint in case @@ -120,34 +131,61 @@ def getName(fp, fp_names): # check if fp already exists. if yes, add a number if fp in fp_names: suffix = 2 - tmp_name = fp+'_'+str(suffix) + tmp_name = fp + '_' + str(suffix) while tmp_name in fp_names: suffix += 1 - tmp_name = fp+'_'+str(suffix) + tmp_name = fp + '_' + str(suffix) return tmp_name else: return fp + # class for handling of evaluation methods class EvalMethod: def __init__(self, name): self.method_name = name self.names = name + def addNames(self, results): results[self.method_name] = defaultdict(list) + def calculate(self, score, index): - return Scoring.CalcAUC(score,index) + return Scoring.CalcAUC(score, index) + def runMethod(self, results, scores, query, index): tmp_list = [] - for k in scores.keys(): # fingerprints + for k in scores.keys(): # fingerprints tmp = self.calculate(scores[k][query], index) tmp_list.append([tmp, k]) # sort list according to the descending score tmp_list.sort(reverse=True) # store [score, rank] - for i,l in enumerate(tmp_list): + for i, l in enumerate(tmp_list): # l[1] = fp, l[0] = score, i+1 = rank - results[self.method_name][l[1]].append([l[0], i+1]) + results[self.method_name][l[1]].append([l[0], i + 1]) + + +class AUROCEvalMethod(EvalMethod): + def __init__(self, name): + self.method_name = name + self.names = name + + def calculate(self, score, index): + scores = [x[0] for x in score] + acts = [x[index] for x in score] + return roc_auc_score(acts, scores) + + +class AUPRCEvalMethod(EvalMethod): + def __init__(self, name): + self.method_name = name + self.names = name + + def calculate(self, score, index): + scores = [x[0] for x in score] + acts = [x[index] for x in score] + return average_precision_score(acts, scores) + class ParamEvalMethod(EvalMethod): def __init__(self, name, params, factor): @@ -155,10 +193,12 @@ def __init__(self, name, params, factor): self.params = params self.names = [] for p in self.params: - self.names.append(name + str(int(factor*p))) + self.names.append(name + str(int(factor * p))) + def addNames(self, results): for n in self.names: results[n] = defaultdict(list) + def runMethod(self, results, scores, query, index): tmp_list = [[] for i in range(len(self.names))] # loop over fingerprints @@ -168,28 +208,31 @@ def runMethod(self, results, scores, query, index): for i in range(len(self.names)): tmp_list[i].append([tmp[i], k]) # loop over parameters - for i,n in enumerate(self.names): + for i, n in enumerate(self.names): # sort list according to the descending score tmp_list[i].sort(reverse=True) # store [score, rank] - for j,l in enumerate(tmp_list[i]): + for j, l in enumerate(tmp_list[i]): # l[1] = fp, l[0] = score, j+1 = rank - results[n][l[1]].append([l[0], 
j+1]) + results[n][l[1]].append([l[0], j + 1]) + class EFMethod(ParamEvalMethod): def calculate(self, score, index): - return Scoring.CalcEnrichment(score,index,self.params) + return Scoring.CalcEnrichment(score, index, self.params) + class BEDROCMethod(ParamEvalMethod): def calculate(self, score, index): tmp = [] for p in self.params: - tmp.append(Scoring.CalcBEDROC(score,index,p)) + tmp.append(Scoring.CalcBEDROC(score, index, p)) return tmp + class RIEMethod(ParamEvalMethod): def calculate(self, score, index): tmp = [] for p in self.params: - tmp.append(Scoring.CalcRIE(score,index,p)) + tmp.append(Scoring.CalcRIE(score, index, p)) return tmp From a1225b08904cd7775690af55288a400522841cf1 Mon Sep 17 00:00:00 2001 From: greg landrum Date: Sun, 18 Dec 2022 11:17:14 +0100 Subject: [PATCH 7/9] add BRF scorer --- .../calculate_scored_lists_BRF.py | 231 ++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 scoring/data_sets_II/calculate_scored_lists_BRF.py diff --git a/scoring/data_sets_II/calculate_scored_lists_BRF.py b/scoring/data_sets_II/calculate_scored_lists_BRF.py new file mode 100644 index 0000000..84ddcf0 --- /dev/null +++ b/scoring/data_sets_II/calculate_scored_lists_BRF.py @@ -0,0 +1,231 @@ +# +# calculates fingerprints and scores lists +# based on the predicted probability +# +# INPUT +# required: +# -f [] : fingerprint to build the balanced random forest with +# optional: +# -o [] : relative output path (default: pwd) +# -a : append to the output file (default: overwrite) +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, +# McConnaughey, Manhattan, RogotGoldberg) +# -r [] : file containing the random forest info +# default parameters: criterion=gini, max_depth=10, +# max_features=auto (=sqrt), num_estimators=100, +# min_samples_split=2, min_samples_leaf=1, n_jobs=1 +# --help : prints usage +# +# OUTPUT: for each target in each data set +# a file with a list (1 element) of RF prediction +# per RF prediction: [name, list of 50 scored lists] +# +# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote +# products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +from rdkit import Chem, DataStructs +import pickle, gzip, sys, os, os.path, numpy +from collections import defaultdict +from optparse import OptionParser +from imblearn.ensemble import BalancedRandomForestClassifier +from rdkit.ML.Data import DataUtils +from multiprocessing import Pool + +# import configuration file with global variables +sys.path.insert(0, os.getcwd()+'/../../') +import configuration_file_II as conf + +# import functions for scoring step +sys.path.insert(0, os.getcwd()+'/../') +import scoring_functions as scor + +# import ML functions +import ml_functions_13 as ml_func + +# paths +cwd = os.getcwd() +parentpath = cwd+'/../../' +inpath_cmp = parentpath+'compounds/' +inpath_list = parentpath+'query_lists/data_sets_II/ChEMBL/' +path = cwd+'/' + +# flag to read in ChEMBL decoys only once +firstchembl = True + +# dictionary for readMLFile() +read_dict = {} +read_dict['criterion'] = lambda x: x +read_dict['max_depth'] = lambda x: int(x) +read_dict['num_estimators'] = lambda x: int(x) +read_dict['min_samples_split'] = lambda x: int(x) +read_dict['min_samples_leaf'] = lambda x: int(x) +read_dict['n_jobs'] = lambda x: int(x) + +# forest._parallel_build_trees = ml_func._balanced_parallel_build_trees + +# prepare command-line option parser +usage = "usage: %prog [options] arg" +parser = OptionParser(usage) +parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train random forest with") +parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") +parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") +parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") + +############# MAIN PART ######################## +if __name__=='__main__': + + # read in command line options + (options, args) = parser.parse_args() + # required arguments + if options.fp: + fp_build = options.fp + else: + raise RuntimeError('one or more of the required options was not given!') + + # optional arguments + do_append = False + if options.do_append: do_append = options.do_append + simil_metric = 'Dice' + if options.simil: simil_metric = options.simil + outpath = path + outpath_set = False + if options.outpath: + outpath_set = True + outpath = path+options.outpath + + # check for sensible input + if outpath_set: scor.checkPath(outpath, 'output') + scor.checkSimil(simil_metric) + + # default machine-learning method variables + ml_dict = dict(criterion='gini', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) 
+    if options.ml:
+        ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml)
+
+    # initialize machine-learning method
+    ml = BalancedRandomForestClassifier(criterion=ml_dict['criterion'], min_samples_split=ml_dict['min_samples_split'], max_depth=ml_dict['max_depth'], min_samples_leaf=ml_dict['min_samples_leaf'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs'])
+
+    # loop over targets
+    for target in conf.set_data:
+        print(target)
+
+        # read in training actives and calculate fps
+        actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb'))
+        for k in actives.keys():
+            for i,m in enumerate(actives[k]):
+                fp_dict = scor.getFP(fp_build, m[1])
+                actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict]
+
+        # read in test actives and calculate fps
+        div_actives = []
+        for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
+            line=line.decode('UTF-8')
+            if line[0] != '#':
+                # structure of line: [external ID, internal ID, SMILES]
+                line = line.rstrip().split()
+                fp_dict = scor.getFP(fp_build, line[2])
+                # store: [internal ID, dict with fps]
+                div_actives.append([line[1], fp_dict])
+        num_test_actives = conf.num_div_act - 1
+        # convert fps to numpy arrays
+        np_fps_div_act = ml_func.getNumpy(div_actives)
+
+        # read in decoys and calculate fps
+        if firstchembl:
+            decoys = []
+            for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'):
+                line=line.decode('UTF-8')
+                if line[0] != '#':
+                    # structure of line: [external ID, internal ID, SMILES]
+                    line = line.rstrip().split()
+                    fp_dict = scor.getFP(fp_build, line[2])
+                    # store: [internal ID, dict with fps]
+                    decoys.append([line[1], fp_dict])
+            # convert fps to numpy arrays
+            np_fps_dcy = ml_func.getNumpy(decoys)
+            firstchembl = False
+        num_decoys = len(decoys)
+        print("molecules read in and fingerprints calculated")
+
+        # open training and test lists
+        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb')
+        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb')
+        # to store the scored lists
+        scores = defaultdict(list)
+        # loop over repetitions
+        for q in actives.keys():
+            print(q)
+            num_actives = len(actives[q])
+            np_fps_act = ml_func.getNumpy(actives[q])
+            training_list = pickle.load(training_input)
+            test_list = pickle.load(test_input)
+            test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]]
+
+            # list with active/inactive info
+            ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives)
+            # training fps
+            train_fps = [actives[q][i][1] for i in range(num_actives)]
+            np_train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]]
+            # fit random forest
+            ml.fit(np_train_fps, ys_fit)
+
+            # test fps and molecule info
+            test_fps = [div_actives[i][1] for i in test_list[:num_test_actives]]
+            test_fps += [decoys[i][1] for i in test_list[num_test_actives:]]
+            np_test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]]
+            np_test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]]
+            test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]]
+            test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]]
+
+            # calculate similarity with standard fp
+            std_simil = []
+            for fp in test_fps:
+                tmp_simil = scor.getBulkSimilarity(fp, train_fps, simil_metric)
+                tmp_simil.sort(reverse=True)
+                std_simil.append(tmp_simil[0])
+
+            # rank based on probability (and second based on similarity)
+            single_score = ml.predict_proba(np_test_fps)
+            # store: [probability, similarity, internal ID, active/inactive]
+            single_score = [[m[1], s, t[0], t[1]] for m,s,t in zip(single_score,std_simil,test_mols)]
+            single_score.sort(reverse=True)
+            scores['brf_'+fp_build].append(single_score)
+
+        # write scores to file
+        if do_append:
+            outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format
+        else:
+            outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
+        for fp in ['brf_'+fp_build]:
+            pickle.dump([fp, scores[fp]], outfile, 2)
+        outfile.close()
+        print("scoring done and scored lists written")
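The BalancedRandomForestClassifier imported above takes over from the earlier
monkey-patch of sklearn's forest._parallel_build_trees (left commented out in
the script): each tree is now grown on a bootstrap sample in which the decoy
majority class is undersampled to match the actives. A minimal sketch of the
classifier as configured here, with hypothetical toy data standing in for the
fingerprint arrays:

    import numpy as np
    from imblearn.ensemble import BalancedRandomForestClassifier

    X = np.random.randint(0, 2, size=(1000, 1024))  # toy binary fingerprints
    y = np.array([1]*50 + [0]*950)                  # 50 actives, 950 decoys
    brf = BalancedRandomForestClassifier(criterion='gini', n_estimators=100,
                                         max_depth=10, min_samples_split=2,
                                         min_samples_leaf=1, n_jobs=4)
    brf.fit(X, y)
    p_active = brf.predict_proba(X)[:, 1]  # column 1 = probability of class 1

This is why the scoring loop takes m[1] from each predict_proba() row: with
classes [0, 1], the second column is the active-class probability.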
From 7f6df3cac7dd023be659863e9a2fa27bc3cccb3f Mon Sep 17 00:00:00 2001
From: greg landrum
Date: Sun, 18 Dec 2022 11:17:26 +0100
Subject: [PATCH 8/9] update NB scorer

---
 .../data_sets_II/calculate_scored_lists_NB.py | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/scoring/data_sets_II/calculate_scored_lists_NB.py b/scoring/data_sets_II/calculate_scored_lists_NB.py
index 12669ed..d59d21f 100644
--- a/scoring/data_sets_II/calculate_scored_lists_NB.py
+++ b/scoring/data_sets_II/calculate_scored_lists_NB.py
@@ -51,7 +51,8 @@
 #
 
 from rdkit import Chem, DataStructs
-import cPickle, gzip, sys, os, os.path, numpy
+import pickle, gzip, sys, os, os.path
+import numpy as np
 from collections import defaultdict
 from optparse import OptionParser
 from sklearn.naive_bayes import BernoulliNB
@@ -128,10 +129,10 @@
 
     # loop over targets
     for target in conf.set_data:
-        print target
+        print(target)
 
         # read in training actives and calculate fps
-        actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r'))
+        actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb'))
         for k in actives.keys():
             for i,m in enumerate(actives[k]):
                 fp_dict = scor.getFP(fp_build, m[1])
@@ -140,6 +141,7 @@
         # read in test actives and calculate fps
         div_actives = []
         for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
+            line=line.decode('UTF-8')
             if line[0] != '#':
                 # structure of line: [external ID, internal ID, SMILES]]
                 line = line.rstrip().split()
@@ -154,6 +156,7 @@
         if firstchembl:
             decoys = []
             for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'):
+                line=line.decode('UTF-8')
                 if line[0] != '#':
                     # structure of line: [external ID, internal ID, SMILES]]
                     line = line.rstrip().split()
@@ -164,20 +167,20 @@
             np_fps_dcy = ml_func.getNumpy(decoys)
             firstchembl = False
         num_decoys = len(decoys)
-        print "molecules read in and fingerprints calculated"
+        print("molecules read in and fingerprints calculated")
 
         # open training and test lists
-        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r')
-        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r')
+        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb')
+        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb')
         # to store the scored lists
         scores = defaultdict(list)
         # loop over repetitions
        for q in actives.keys():
-            print q
+            print(q)
             num_actives = len(actives[q])
             np_fps_act = ml_func.getNumpy(actives[q])
-            training_list = cPickle.load(training_input)
-            test_list = cPickle.load(test_input)
+            training_list = pickle.load(training_input)
+            test_list = pickle.load(test_input)
             test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]]
 
             # list with active/inactive info
@@ -206,6 +209,6 @@
         else:
             outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
         for fp in ['nb_'+fp_build]:
-            cPickle.dump([fp, scores[fp]], outfile, 2)
+            pickle.dump([fp, scores[fp]], outfile, 2)
         outfile.close()
-        print "scoring done and scored lists written"
+        print("scoring done and scored lists written")
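The same Python 3 port pattern recurs in each of these commits: pickle replaces
cPickle and needs binary file modes ('rb'/'wb'), and gzip.open() yields bytes,
hence the explicit line.decode('UTF-8') before the '#' comparison. An
equivalent alternative (a sketch, not what the patch does, shown for a
hypothetical target id 100) is to open the archive in text mode instead:

    import gzip

    # 'rt' makes gzip.open decode each line to str, so no decode() is needed
    with gzip.open('cmp_list_ChEMBL_100_actives.dat.gz', 'rt') as inf:
        for line in inf:
            if line[0] != '#':
                # structure of line: [external ID, internal ID, SMILES]
                ext_id, int_id, smiles = line.rstrip().split()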
From 59fd09ba1049ff137ba81f3b4e77b4b85ed30072 Mon Sep 17 00:00:00 2001
From: greg landrum
Date: Mon, 19 Dec 2022 04:31:55 +0100
Subject: [PATCH 9/9] basics

---
 .../data_sets_II/calculate_scored_lists.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/scoring/data_sets_II/calculate_scored_lists.py b/scoring/data_sets_II/calculate_scored_lists.py
index 06a0deb..bb196ed 100644
--- a/scoring/data_sets_II/calculate_scored_lists.py
+++ b/scoring/data_sets_II/calculate_scored_lists.py
@@ -48,7 +48,7 @@
 #
 
 from rdkit import Chem, DataStructs
-import cPickle, gzip, sys, os, os.path
+import pickle, gzip, sys, os, os.path
 from collections import defaultdict
 from optparse import OptionParser
 
@@ -108,10 +108,10 @@
 
     # loop over targets
     for target in conf.set_data:
-        print target
+        print(target)
 
         # read in training actives and calculate fps
-        actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r'))
+        actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb'))
         for k in actives.keys():
             for i,m in enumerate(actives[k]):
                 fp_dict = scor.getFPDict(fp_names, m[1])
@@ -120,6 +120,7 @@
         # read in test actives and calculate fps
         div_actives = []
         for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
+            line=line.decode('UTF-8')
             if line[0] != '#':
                 # structure of line: [external ID, internal ID, SMILES]]
                 line = line.rstrip().split()
@@ -132,6 +133,7 @@
         if firstchembl:
             decoys = []
             for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'):
+                line=line.decode('UTF-8')
                 if line[0] != '#':
                     # structure of line: [external ID, internal ID, SMILES]]
                     line = line.rstrip().split()
@@ -140,19 +142,19 @@
                     decoys.append([line[1], fp_dict])
             firstchembl = False
         num_decoys = len(decoys)
-        print "molecules read in and fingerprints calculated"
+        print("molecules read in and fingerprints calculated")
 
         # open training and test lists
-        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r')
-        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r')
+        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb')
+        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb')
         # to store the scored lists
         scores = defaultdict(list)
 
         # loop over papers
         for q in actives.keys():
             num_actives = len(actives[q])
-            training_list = cPickle.load(training_input)
-            test_list = cPickle.load(test_input)
+            training_list = pickle.load(training_input)
+            test_list = pickle.load(test_input)
             test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]]
             # loop over fps
             single_score = defaultdict(list)
@@ -175,6 +177,6 @@
         else:
             outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
         for fp in fp_names:
-            cPickle.dump([fp, scores[fp]], outfile, 2)
+            pickle.dump([fp, scores[fp]], outfile, 2)
         outfile.close()
-        print "scoring done and scored lists written"
+        print("scoring done and scored lists written")
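Since all three scorers may append several pickles to one gzip archive (mode
'ab+' when -a is given), reading a scored list back means calling
pickle.load() until the stream is exhausted. A minimal sketch, with a
hypothetical output file name:

    import gzip, pickle

    results = {}
    with gzip.open('list_100.pkl.gz', 'rb') as inf:
        while True:
            try:
                # each dump is [fingerprint name, list of scored lists]
                fp_name, score_lists = pickle.load(inf)
            except EOFError:
                break
            results[fp_name] = score_lists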