From fe9ceb90e2d9676e8ac815f954c728bed73bc7ef Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Sat, 12 Aug 2017 07:12:28 +0200 Subject: [PATCH 1/9] save --- analysis/data_sets_I/run_analysis.py | 16 +++++----- .../data_sets_I/calculate_scored_lists_RF.py | 32 +++++++++---------- .../data_sets_II/calculate_scored_lists_RF.py | 28 ++++++++-------- scoring/fingerprint_lib.py | 4 +-- .../calculate_validation_methods.py | 24 +++++++------- 5 files changed, 52 insertions(+), 52 deletions(-) diff --git a/analysis/data_sets_I/run_analysis.py b/analysis/data_sets_I/run_analysis.py index ad4215d..456e9f8 100644 --- a/analysis/data_sets_I/run_analysis.py +++ b/analysis/data_sets_I/run_analysis.py @@ -17,19 +17,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -101,7 +101,7 @@ inpath = tmppath+'/'+dataset # loop over targets - for target in conf.set_data: + for target in conf.set_data[dataset]['ids']: print target # load results diff --git a/scoring/data_sets_I/calculate_scored_lists_RF.py b/scoring/data_sets_I/calculate_scored_lists_RF.py index 1f5c74e..3736d77 100644 --- a/scoring/data_sets_I/calculate_scored_lists_RF.py +++ b/scoring/data_sets_I/calculate_scored_lists_RF.py @@ -9,8 +9,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the random forest info # default parameters: criterion=gini, max_depth=10, @@ -24,19 +24,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -55,7 +55,7 @@ from rdkit import Chem, DataStructs import cPickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest from sklearn.tree import tree from rdkit.ML.Data import DataUtils @@ -92,7 +92,7 @@ read_dict['min_samples_leaf'] = lambda x: int(x) read_dict['n_jobs'] = lambda x: int(x) -forest._parallel_build_trees = ml_func._balanced_parallel_build_trees +#forest._parallel_build_trees = ml_func._balanced_parallel_build_trees # prepare command-line option parser usage = "usage: %prog [options] arg" @@ -110,7 +110,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.num and options.fp: + if options.num and options.fp: num_query_mols = options.num fp_build = options.fp else: @@ -133,7 +133,7 @@ scor.checkQueryMols(num_query_mols, conf.list_num_query_mols) # default machine-learning method variables - ml_dict = dict(criterion='gini', max_features='auto', n_jobs=1, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) + ml_dict = dict(criterion='gini', max_features='auto', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) if options.ml: ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) @@ -150,7 +150,7 @@ # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -166,7 +166,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -178,7 +178,7 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) diff --git a/scoring/data_sets_II/calculate_scored_lists_RF.py b/scoring/data_sets_II/calculate_scored_lists_RF.py index f060364..a9b7e2b 100644 --- a/scoring/data_sets_II/calculate_scored_lists_RF.py +++ b/scoring/data_sets_II/calculate_scored_lists_RF.py @@ -8,8 +8,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# 
other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the random forest info # default parameters: criterion=gini, max_depth=10, @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -54,7 +54,7 @@ from rdkit import Chem, DataStructs import cPickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest from sklearn.tree import tree from rdkit.ML.Data import DataUtils @@ -91,7 +91,7 @@ read_dict['min_samples_leaf'] = lambda x: int(x) read_dict['n_jobs'] = lambda x: int(x) -forest._parallel_build_trees = ml_func._balanced_parallel_build_trees +# forest._parallel_build_trees = ml_func._balanced_parallel_build_trees # prepare command-line option parser usage = "usage: %prog [options] arg" @@ -108,7 +108,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.fp: + if options.fp: fp_build = options.fp else: raise RuntimeError('one or more of the required options was not given!') @@ -150,7 +150,7 @@ # read in test actives and calculate fps div_actives = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -164,7 +164,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index 9c7b606..c5a5fda 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -12,7 +12,7 @@ from rdkit.Chem import rdMolDescriptors # implemented fingerprints: -# ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), +# ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), # atom pairs (ap), atom pairs bit vector (apbv), topological torsions (tt) # hashed atom pairs 
(hashap), hashed topological torsions (hashtt) --> with 1024 bits # ECFP4 (ecfp4), ECFP6 (ecfp6), ECFC4 (ecfc4), ECFC6 (ecfc6) --> with 1024 bits @@ -23,7 +23,7 @@ # RDKit with path length = 5 (rdk5), with path length = 6 (rdk6), with path length = 7 (rdk7) # 2D pharmacophore (pharm) ????????????? -nbits = 1024 +nbits = 2048 longbits = 16384 # dictionary diff --git a/validation/data_sets_I/calculate_validation_methods.py b/validation/data_sets_I/calculate_validation_methods.py index 794e53d..d4402cf 100644 --- a/validation/data_sets_I/calculate_validation_methods.py +++ b/validation/data_sets_I/calculate_validation_methods.py @@ -7,7 +7,7 @@ # # INPUT # required: -# -m [] : file containing the methods +# -m [] : file containing the methods # implemented methods are: AUC, BEDROC ([alpha] optional), # RIE ([alpha] optional), EF ([percentage] optional) # optional: @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -94,11 +94,11 @@ # optional arguments inpath = parentpath+'scoring/' - if options.inpath: + if options.inpath: inpath = [path+i for i in options.inpath] vfunc.checkPaths(inpath) outpath = path - if options.outpath: + if options.outpath: outpath = path+options.outpath vfunc.checkPaths([outpath]) remove_fps = [] @@ -125,7 +125,7 @@ # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'_.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'r') while 1: try: tmp = cPickle.load(myfile) @@ -150,7 +150,7 @@ # loop of repetitions for q in range(conf.num_reps): # loop over evaluation methods - for m in method_dict.keys(): + for m in method_dict.keys(): method_dict[m].runMethod(results, scores, q, -1) print "validation methods calculated" From f0763b26be8f6029316f43c0ec867d84ee3f6cf7 Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Tue, 10 Oct 2017 04:23:51 +0200 Subject: [PATCH 2/9] get most everything working with python3 --- analysis/data_sets_I/run_analysis.py | 12 ++--- analysis/data_sets_I/run_fp_summary.py | 22 ++++---- analysis/data_sets_I/run_method_summary.py | 22 ++++---- analysis/data_sets_II/run_analysis.py | 22 ++++---- analysis/data_sets_II/run_fp_summary.py | 20 ++++---- analysis/data_sets_II/run_method_summary.py | 18 +++---- scoring/data_sets_I/apply_fusion.py | 36 ++++++------- scoring/data_sets_I/calculate_scored_lists.py | 49 +++++++++--------- .../data_sets_I/calculate_scored_lists_LR.py | 49 +++++++++--------- .../data_sets_I/calculate_scored_lists_RF.py | 21 ++++---- .../data_sets_II/calculate_scored_lists_LR.py | 50 ++++++++++--------- .../data_sets_II/calculate_scored_lists_RF.py | 26 +++++----- scoring/scoring_functions.py | 26 +++++----- .../calculate_validation_methods.py | 18 +++---- .../calculate_validation_methods.py | 40 +++++++-------- validation/validation_functions.py | 33 ++++++------ 16 files changed, 238 insertions(+), 226 deletions(-) diff --git a/analysis/data_sets_I/run_analysis.py b/analysis/data_sets_I/run_analysis.py index 456e9f8..6ae1f8d 100644 --- a/analysis/data_sets_I/run_analysis.py +++ b/analysis/data_sets_I/run_analysis.py @@ -45,7 +45,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
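The hunks that follow swap cPickle for pickle across the analysis scripts: Python 3 removed the cPickle module, and plain pickle now uses the C implementation automatically. A minimal sketch of the reading side, assuming a validation file written by these scripts (the file name is illustrative):

    import gzip
    import pickle

    # pickle needs a binary stream, hence the accompanying 'r' -> 'rb' changes
    # (for gzip.open, plain 'r' already means binary in Python 3; 'rb' just makes it explicit)
    with gzip.open('validation_11140.pkl.gz', 'rb') as f:
        validation = pickle.load(f)
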
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path import numpy as np from scipy import special, stats from collections import defaultdict @@ -92,7 +92,7 @@ # loop over dataset sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # output directories and input directory outdir = outpath+'/'+dataset if not os.path.exists(outdir): os.makedirs(outdir) @@ -102,12 +102,12 @@ # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load results - validation = cPickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'r')) - methodkeys = validation.keys() - fpkeys = validation[methodkeys[0]].keys() + validation = pickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'rb')) + methodkeys = list(validation.keys()) + fpkeys = list(validation[methodkeys[0]].keys()) # if ranks is not yet set: prepare it if len(ranks) == 0: diff --git a/analysis/data_sets_I/run_fp_summary.py b/analysis/data_sets_I/run_fp_summary.py index 085304c..d5f4d48 100644 --- a/analysis/data_sets_I/run_fp_summary.py +++ b/analysis/data_sets_I/run_fp_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -81,17 +81,17 @@ # loop over dataset sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # input path inpath = outpath+'/'+dataset # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) - methodkeys = results.keys() + methodkeys = list(results.keys()) # if summary is not yet set: prepare it if len(summary) == 0: diff --git a/analysis/data_sets_I/run_method_summary.py b/analysis/data_sets_I/run_method_summary.py index c12e89e..048ad34 100644 --- a/analysis/data_sets_I/run_method_summary.py +++ b/analysis/data_sets_I/run_method_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. 
-# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -81,17 +81,17 @@ # loop over dataset sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # input directory inpath = outpath+'/'+dataset # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) - methodkeys = results.keys() + methodkeys = list(results.keys()) # if summary is not yet set: prepare it if len(summary) == 0: diff --git a/analysis/data_sets_II/run_analysis.py b/analysis/data_sets_II/run_analysis.py index be0dc42..9a69177 100644 --- a/analysis/data_sets_II/run_analysis.py +++ b/analysis/data_sets_II/run_analysis.py @@ -17,19 +17,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -45,7 +45,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
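The list(...) wrappers added in the next hunks are required because dict.keys() returns a view object in Python 3 and can no longer be indexed. A small illustration, with invented dictionary contents:

    validation = {'AUC': {'ecfp4': [0.71]}, 'EF5': {'ecfp4': [4.2]}}
    methodkeys = validation.keys()        # dict_keys view under Python 3
    # methodkeys[0]                       # TypeError: 'dict_keys' object is not subscriptable
    methodkeys = list(validation.keys())  # materialize before indexing
    fpkeys = list(validation[methodkeys[0]].keys())

Plain iteration over a view still works, which is why loops such as "for m in method_dict.keys():" are left unchanged throughout.
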
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path import numpy as np from scipy import special, stats from collections import defaultdict @@ -99,11 +99,11 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load results - validation = cPickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'r')) - methodkeys = validation.keys() + validation = pickle.load(gzip.open(inpath+'/validation_'+str(target)+'.pkl.gz', 'rb')) + methodkeys = list(validation.keys()) fpkeys = validation[methodkeys[0]].keys() # if ranks is not yet set: prepare it diff --git a/analysis/data_sets_II/run_fp_summary.py b/analysis/data_sets_II/run_fp_summary.py index b28fe8a..8580cf5 100644 --- a/analysis/data_sets_II/run_fp_summary.py +++ b/analysis/data_sets_II/run_fp_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -84,11 +84,11 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) - methodkeys = results.keys() + methodkeys = list(results.keys()) # if summary is not yet set: prepare it if len(summary) == 0: diff --git a/analysis/data_sets_II/run_method_summary.py b/analysis/data_sets_II/run_method_summary.py index 66154ee..949e076 100644 --- a/analysis/data_sets_II/run_method_summary.py +++ b/analysis/data_sets_II/run_method_summary.py @@ -15,19 +15,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -43,7 +43,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -import gzip, cPickle, sys, os, os.path +import gzip, pickle, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -84,7 +84,7 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load results results, fpkeys = ana_func.readFile(open(inpath+'/target_'+str(target)+'.txt', 'r')) diff --git a/scoring/data_sets_I/apply_fusion.py b/scoring/data_sets_I/apply_fusion.py index 7749be8..9f307b3 100644 --- a/scoring/data_sets_I/apply_fusion.py +++ b/scoring/data_sets_I/apply_fusion.py @@ -2,7 +2,7 @@ # $Id$ # # loads ranked lists from different -# models and/or fingerprints and +# models and/or fingerprints and # apply rank-based fusion # # INPUT @@ -22,19 +22,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -50,7 +50,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
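In apply_fusion.py below, the scoring scripts append one pickled [fp_name, scored_lists] record per fingerprint to the same gzip file, so the reader loops until pickle raises EOFError. The pattern in isolation (file name follows the convention used here; contents assumed):

    import gzip
    import pickle

    records = []
    with gzip.open('list_ChEMBL_11140_.pkl.gz', 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))  # one record per dump() call
            except EOFError:                    # no sentinel object: EOF ends the stream
                break
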
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path from collections import defaultdict from optparse import OptionParser @@ -101,7 +101,7 @@ if options.rm_file: remove_fps = scor.readFPs(path+options.rm_file) outpath = path - if options.outpath: + if options.outpath: outpath = path+options.outpath scor.checkPath(outpath, 'output') do_append = False @@ -112,19 +112,19 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'_.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'_.pkl.gz', 'rb') while 1: try: - tmp = cPickle.load(myfile) + tmp = pickle.load(myfile) except (EOFError): break else: @@ -133,9 +133,9 @@ tmp[0] = scor.getName(tmp[0], scores.keys()) # input line: [fp_name, list of scored lists] scores[tmp[0]] = tmp[1] - print "scored lists read in" + print( "scored lists read in") if len(scores.keys()) < 2: - print "number of fingerprints/models < 2, nothing to be done" + print( "number of fingerprints/models < 2, nothing to be done") break if printfp: # determine the name of the fusion @@ -176,6 +176,6 @@ outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'_'+'.pkl.gz', 'ab+') # binary format else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'_'+'.pkl.gz', 'wb+') # binary format - cPickle.dump([fpname, new_scores], outfile, 2) + pickle.dump([fpname, new_scores], outfile, 2) outfile.close() - print "fusion ranking done and ranked list written" + print( "fusion ranking done and ranked list written") diff --git a/scoring/data_sets_I/calculate_scored_lists.py b/scoring/data_sets_I/calculate_scored_lists.py index 415c8f4..4c1f6ec 100644 --- a/scoring/data_sets_I/calculate_scored_lists.py +++ b/scoring/data_sets_I/calculate_scored_lists.py @@ -9,8 +9,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # --help : prints usage # @@ -20,19 +20,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. 
+# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -49,9 +49,9 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path +import pickle, gzip, sys, os, os.path from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser # import configuration file with global variables sys.path.insert(0, os.getcwd()+'/../../') @@ -86,7 +86,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.num and options.fp_file: + if options.num and options.fp_file: num_query_mols = options.num fp_file = path+options.fp_file else: @@ -112,15 +112,16 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print(dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFPDict(fp_names, line[2]) @@ -134,7 +135,8 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFPDict(fp_names, line[2]) @@ -144,23 +146,24 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFPDict(fp_names, line[2]) # store: [internal ID, dict with fps] decoys.append([line[1], fp_dict]) num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training lists - training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'r') + training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in range(conf.num_reps): - training_list = cPickle.load(training_input) + training_list = pickle.load(training_input) test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]] test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]] # loop over fps @@ -174,7 +177,7 @@ tmp_score = scor.getBulkSimilarity(tmp_mol[1], query_fps, simil_metric) # use max fusion # store : [similarity, internal ID, active/inactive] - single_score[fp].append([tmp_score[0], tmp_mol[0], tmp_mol[2]]) + single_score[fp].append([tmp_score[0], tmp_mol[0], tmp_mol[2]]) # rank list according to similarity scores[fp].append(sorted(single_score[fp], reverse=True)) @@ -184,6 +187,6 @@ else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in fp_names: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() 
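The line = line.decode('UTF-8') insertions above are needed because gzip.open(..., 'r') yields bytes under Python 3; each record is decoded before the comment check and the split. The pattern in isolation, with an illustrative file name:

    import gzip

    actives = []
    for line in gzip.open('cmp_list_ChEMBL_11140_actives.dat.gz', 'r'):
        line = line.decode('UTF-8')  # bytes -> str; gzip 'r' mode is binary in Python 3
        if line[0] != '#':           # skip comment lines
            # structure of line: [external ID, internal ID, SMILES]
            ext_id, int_id, smiles = line.rstrip().split()
            actives.append((int_id, smiles))
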
- print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_I/calculate_scored_lists_LR.py b/scoring/data_sets_I/calculate_scored_lists_LR.py index 4a7e804..81cd286 100644 --- a/scoring/data_sets_I/calculate_scored_lists_LR.py +++ b/scoring/data_sets_I/calculate_scored_lists_LR.py @@ -9,8 +9,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the logistic regression info # default parameters: penalty='l2', dual=0 (false), C=1.0, @@ -24,19 +24,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -53,9 +53,9 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.linear_model import LogisticRegression # import configuration file with global variables @@ -105,7 +105,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.num and options.fp: + if options.num and options.fp: num_query_mols = options.num fp_build = options.fp else: @@ -137,15 +137,16 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -161,7 +162,8 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -173,7 +175,8 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -182,17 +185,17 @@ # convert fps to numpy arrays np_fps_dcy = ml_func.getNumpy(decoys) num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training lists - training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'r') + training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in range(conf.num_reps): - print q - training_list = cPickle.load(training_input) + print( q) + training_list = pickle.load(training_input) test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]] test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]] @@ -223,6 +226,6 @@ else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['lr_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_I/calculate_scored_lists_RF.py b/scoring/data_sets_I/calculate_scored_lists_RF.py index 3736d77..74c1c29 100644 --- a/scoring/data_sets_I/calculate_scored_lists_RF.py +++ b/scoring/data_sets_I/calculate_scored_lists_RF.py @@ -53,7 +53,7 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from 
collections import defaultdict from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest @@ -142,14 +142,15 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # read in actives and calculate fps actives = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -166,6 +167,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -178,6 +180,7 @@ else: decoys = [] for line in gzip.open(inpath_cmp+dataset+'/cmp_list_'+dataset+'_'+str(target)+'_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -187,17 +190,17 @@ # convert fps to numpy arrays np_fps_dcy = ml_func.getNumpy(decoys) num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training lists - training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'r') + training_input = open(inpath_list+dataset+'/training_'+dataset+'_'+str(target)+'_'+str(num_query_mols)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in range(conf.num_reps): - print q - training_list = cPickle.load(training_input) + print( q) + training_list = pickle.load(training_input) test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]] test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]] @@ -238,6 +241,6 @@ else: outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['rf_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_II/calculate_scored_lists_LR.py b/scoring/data_sets_II/calculate_scored_lists_LR.py index 12df4a8..f1f1e63 100644 --- a/scoring/data_sets_II/calculate_scored_lists_LR.py +++ b/scoring/data_sets_II/calculate_scored_lists_LR.py @@ -8,8 +8,8 @@ # optional: # -o [] : relative output path (default: pwd) # -a : append to the output file (default: overwrite) -# -s [] : similarity metric (default: Dice, -# other options: Tanimoto, Cosine, Russel, Kulczynski, +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, # McConnaughey, Manhattan, RogotGoldberg) # -r [] : file containing the logistic regression info # default parameters: penalty='l2', dual=0 (false), C=1.0, @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. 
-# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -52,9 +52,9 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict -from optparse import OptionParser +from optparse import OptionParser from sklearn.linear_model import LogisticRegression # import configuration file with global variables @@ -103,7 +103,7 @@ # read in command line options (options, args) = parser.parse_args() # required arguments - if options.fp: + if options.fp: fp_build = options.fp else: raise RuntimeError('one or more of the required options was not given!') @@ -133,10 +133,10 @@ # loop over targets for target in conf.set_data: - print target + print( target) # read in training actives and calculate fps - actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r')) + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) for k in actives.keys(): for i,m in enumerate(actives[k]): fp_dict = scor.getFP(fp_build, m[1]) @@ -145,7 +145,8 @@ # read in test actives and calculate fps div_actives = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -159,7 +160,8 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): - if line[0] != '#': + line=line.decode('UTF-8') + if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) @@ -169,20 +171,20 @@ np_fps_dcy = ml_func.getNumpy(decoys) firstchembl = False num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training and test lists - training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r') - test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r') + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in actives.keys(): - print q + print( q) num_actives = 
len(actives[q]) np_fps_act = ml_func.getNumpy(actives[q]) - training_list = cPickle.load(training_input) - test_list = cPickle.load(test_input) + training_list = pickle.load(training_input) + test_list = pickle.load(test_input) test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] # list with active/inactive info @@ -211,6 +213,6 @@ else: outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['lr_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_II/calculate_scored_lists_RF.py b/scoring/data_sets_II/calculate_scored_lists_RF.py index a9b7e2b..e40d390 100644 --- a/scoring/data_sets_II/calculate_scored_lists_RF.py +++ b/scoring/data_sets_II/calculate_scored_lists_RF.py @@ -52,7 +52,7 @@ # from rdkit import Chem, DataStructs -import cPickle, gzip, sys, os, os.path, numpy +import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict from optparse import OptionParser from sklearn.ensemble import RandomForestClassifier, forest @@ -129,7 +129,7 @@ scor.checkSimil(simil_metric) # default machine-learning method variables - ml_dict = dict(criterion='gini', max_features='auto', n_jobs=1, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) + ml_dict = dict(criterion='gini', max_features='auto', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) if options.ml: ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) @@ -138,10 +138,10 @@ # loop over targets for target in conf.set_data: - print target + print( target) # read in training actives and calculate fps - actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r')) + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) for k in actives.keys(): for i,m in enumerate(actives[k]): fp_dict = scor.getFP(fp_build, m[1]) @@ -150,6 +150,7 @@ # read in test actives and calculate fps div_actives = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -164,6 +165,7 @@ if firstchembl: decoys = [] for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() @@ -174,20 +176,20 @@ np_fps_dcy = ml_func.getNumpy(decoys) firstchembl = False num_decoys = len(decoys) - print "molecules read in and fingerprints calculated" + print( "molecules read in and fingerprints calculated") # open training and test lists - training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r') - test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r') + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') # to store the scored lists scores = defaultdict(list) # loop over repetitions for q in actives.keys(): - print q + print( q) num_actives = len(actives[q]) np_fps_act = ml_func.getNumpy(actives[q]) - training_list = cPickle.load(training_input) - test_list = cPickle.load(test_input) + training_list = pickle.load(training_input) + test_list = 
pickle.load(test_input) test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] # list with active/inactive info @@ -226,6 +228,6 @@ else: outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['rf_'+fp_build]: - cPickle.dump([fp, scores[fp]], outfile, 2) + pickle.dump([fp, scores[fp]], outfile, 2) outfile.close() - print "scoring done and scored lists written" + print( "scoring done and scored lists written") diff --git a/scoring/scoring_functions.py b/scoring/scoring_functions.py index c7bdc6e..a1ec5a6 100644 --- a/scoring/scoring_functions.py +++ b/scoring/scoring_functions.py @@ -6,19 +6,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -100,13 +100,13 @@ def getBulkSimilarity(fp, fp_list, simil): # helper functions for the fusion def printFPs(fps, fpname): '''Prints a list of fingerprints''' - print "-------------------------------" - print "FUSION DONE FOR:" + print( "-------------------------------") + print( "FUSION DONE FOR:") for fp in fps: - print fp, - print "" - print "Name of fusion:", fpname - print "-------------------------------" + print( fp,) + print( "") + print( "Name of fusion:", fpname) + print( "-------------------------------") def getName(fp, fp_names): '''Determines the new name of a fingerprint in case diff --git a/validation/data_sets_I/calculate_validation_methods.py b/validation/data_sets_I/calculate_validation_methods.py index d4402cf..4b2085d 100644 --- a/validation/data_sets_I/calculate_validation_methods.py +++ b/validation/data_sets_I/calculate_validation_methods.py @@ -51,7 +51,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
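One subtlety in the scoring_functions.py hunk above: the Python 2 idiom "print fp," suppressed the trailing newline, but print( fp,) in Python 3 does not (the comma is just an inert trailing comma inside the call), so the fused fingerprint names now print one per line. Restoring the old single-line output would need the end keyword:

    print(fp, end=' ')  # Python 3 equivalent of the Python 2 `print fp,`
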
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path from collections import defaultdict from optparse import OptionParser from rdkit.ML.Scoring import Scoring @@ -113,22 +113,22 @@ # loop over data-set sources for dataset in conf.set_data.keys(): - print dataset + print( dataset) # output directory outdir = outpath+'/'+dataset if not os.path.exists(outdir): os.makedirs(outdir) # loop over targets for target in conf.set_data[dataset]['ids']: - print target + print( target) # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'rb') while 1: try: - tmp = cPickle.load(myfile) + tmp = pickle.load(myfile) except (EOFError): break else: @@ -137,7 +137,7 @@ tmp[0] = vfunc.getName(tmp[0], scores.keys()) # input line: [fp_name, list of scored lists] scores[tmp[0]] = tmp[1] - print "scored lists read in" + print( "scored lists read in") if printfp: vfunc.printFPs(scores.keys()) printfp = False @@ -153,11 +153,11 @@ for m in method_dict.keys(): method_dict[m].runMethod(results, scores, q, -1) - print "validation methods calculated" + print( "validation methods calculated") # write results outf = gzip.open(outdir+'/validation_'+str(target)+'.pkl.gz', 'wb+') - cPickle.dump(results, outf, 2) + pickle.dump(results, outf, 2) outf.close() - print "results written out" + print( "results written out") diff --git a/validation/data_sets_II/calculate_validation_methods.py b/validation/data_sets_II/calculate_validation_methods.py index f370c1b..da6b3a6 100644 --- a/validation/data_sets_II/calculate_validation_methods.py +++ b/validation/data_sets_II/calculate_validation_methods.py @@ -7,7 +7,7 @@ # # INPUT # required: -# -m [] : file containing the methods +# -m [] : file containing the methods # implemented methods are: AUC, BEDROC ([alpha] optional), # RIE ([alpha] optional), EF ([percentage] optional) # optional: @@ -23,19 +23,19 @@ # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are -# met: +# met: # -# * Redistributions of source code must retain the above copyright +# * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided # with the distribution. -# * Neither the name of Novartis Institutes for BioMedical Research Inc. -# nor the names of its contributors may be used to endorse or promote +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -51,7 +51,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
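In the hunks below, scores[scores.keys()[0]] becomes list(scores.values())[0]: a Python 3 keys() view cannot be indexed, so the number of repetitions is read off the first stored list directly. Illustration with placeholder data:

    scores = {'rf_ecfp4': [['rep0'], ['rep1']]}  # one scored list per repetition
    first = list(scores.values())[0]             # materialize the view, take any entry
    for q in range(len(first)):                  # loop over repetitions
        pass
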
# -import gzip, cPickle, math, sys, os, os.path +import gzip, pickle, math, sys, os, os.path from collections import defaultdict from optparse import OptionParser from rdkit.ML.Scoring import Scoring @@ -94,11 +94,11 @@ # optional arguments inpath = parentpath+'scoring/' - if options.inpath: + if options.inpath: inpath = [path+i for i in options.inpath] vfunc.checkPaths(inpath) outpath = path - if options.outpath: + if options.outpath: outpath = path+options.outpath vfunc.checkPaths([outpath]) remove_fps = [] @@ -117,15 +117,15 @@ # loop over targets for target in conf.set_data: - print target + print( target) # load scored lists scores = {} for inp in inpath: # loop over input paths - myfile = gzip.open(inp+'/list_'+str(target)+'.pkl.gz', 'r') + myfile = gzip.open(inp+'/list_'+str(target)+'.pkl.gz', 'rb') while 1: try: - tmp = cPickle.load(myfile) + tmp = pickle.load(myfile) except (EOFError): break else: @@ -134,7 +134,7 @@ tmp[0] = vfunc.getName(tmp[0], scores.keys()) # input line: [fp_name, list of scored lists] scores[tmp[0]] = tmp[1] - print "scored lists read in" + print( "scored lists read in") if printfp: vfunc.printFPs(scores.keys()) printfp = False @@ -145,16 +145,16 @@ method_dict[m].addNames(results) # loop of papers - for q in range(len(scores[scores.keys()[0]])): + for q in range(len(list(scores.values())[0])): # loop over evaluation methods - for m in method_dict.keys(): + for m in method_dict.keys(): method_dict[m].runMethod(results, scores, q, -1) - print "validation methods calculated" + print( "validation methods calculated") # write results outf = gzip.open(outdir+'/validation_'+str(target)+'.pkl.gz', 'wb+') - cPickle.dump(results, outf, 2) + pickle.dump(results, outf, 2) outf.close() - print "results written out" + print( "results written out") diff --git a/validation/validation_functions.py b/validation/validation_functions.py index 89fd19f..e8ffd21 100644 --- a/validation/validation_functions.py +++ b/validation/validation_functions.py @@ -91,28 +91,28 @@ def readFPs(filepath): def printInputParam(method_dict, inpath): '''Prints the input parameters''' - print "-------------------------------" - print "PARAMETERS USED" - print "Validation methods: " + print( "-------------------------------") + print( "PARAMETERS USED") + print( "Validation methods: ") for m in method_dict.keys(): if isinstance(method_dict[m], ParamEvalMethod): - print m, "- parameters:", method_dict[m].params + print( m, "- parameters:", method_dict[m].params) else: - print m - print "" - print "Input paths:" + print( m) + print( "") + print( "Input paths:") for inp in inpath: - print inp - print "-------------------------------" + print( inp) + print( "-------------------------------") def printFPs(fps): '''Prints a list of fingerprints''' - print "-------------------------------" - print "FINGERPRINTS CONSIDERED" + print( "-------------------------------") + print( "FINGERPRINTS CONSIDERED") for fp in fps: - print fp, - print "" - print "-------------------------------" + print( " ",fp) + print( "") + print( "-------------------------------") def getName(fp, fp_names): '''Determines the new name of a fingerprint in case @@ -157,12 +157,12 @@ def __init__(self, name, params, factor): for p in self.params: self.names.append(name + str(int(factor*p))) def addNames(self, results): - for n in self.names: + for n in self.names: results[n] = defaultdict(list) def runMethod(self, results, scores, query, index): tmp_list = [[] for i in range(len(self.names))] # loop over fingerprints - for k in 
scores.keys(): + for k in scores.keys(): tmp = self.calculate(scores[k][query], index) # loop over parameters for i in range(len(self.names)): @@ -193,4 +193,3 @@ def calculate(self, score, index): for p in self.params: tmp.append(Scoring.CalcRIE(score,index,p)) return tmp - From 4029f33cefd3f20dd3ffa5c787d60c149343cf19 Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Tue, 10 Oct 2017 05:52:19 +0200 Subject: [PATCH 3/9] add reversible (Crude!) --- scoring/fingerprint_lib.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index c5a5fda..cd24a07 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -11,6 +11,43 @@ from rdkit.Chem.ChemicalFeatures import BuildFeatureFactory from rdkit.Chem import rdMolDescriptors +import pickle +from rdkit.Chem import BRICS + +fragmentList = pickle.load(open('/home/glandrum/Projects/reversible_fingerprints/data/frags.min2.ordered.pkl','rb')) +from collections import Counter,defaultdict +def generateFragmentFingerprint(mol,nBits=4096,fragmentList=fragmentList): + frags = BRICS.BRICSDecompose(mol,minFragmentSize=2) + res = Counter() + for frag in frags: + try: + idx = fragmentList.index(frag) + except ValueError: + continue + res[idx%nBits] += 1 + return res +def FoldedRDKFingerprintCountBased(mol,fpSize=1024,**kwargs): + bitInfo = {} + unfolded = Chem.UnfoldedRDKFingerprintCountBased(mol,branchedPaths=False,minPath=3,maxPath=3,bitInfo=bitInfo, + **kwargs) + res = {} + newBitInfo = defaultdict(list) + for k,b in unfolded.GetNonzeroElements().items(): + res[k%fpSize] = b + newBitInfo[k%fpSize].extend(bitInfo[k]) + return res,newBitInfo + +from rdkit import DataStructs +def GetReversibleFingerprint(mol,nFragmentBits=4096,nRDKitBits=1024): + res = DataStructs.UIntSparseIntVect(nFragmentBits+nRDKitBits) + fragfp = generateFragmentFingerprint(mol,nBits=nFragmentBits) + rdkfp,_ = FoldedRDKFingerprintCountBased(mol,fpSize=nRDKitBits) + for bit,count in fragfp.items(): + res[bit] = count + for bit,count in rdkfp.items(): + res[bit+nFragmentBits] = count + return res + # implemented fingerprints: # ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), # atom pairs (ap), atom pairs bit vector (apbv), topological torsions (tt) @@ -28,6 +65,7 @@ # dictionary fpdict = {} +fpdict['reversible'] = GetReversibleFingerprint fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 0, nBits=nbits) fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=nbits) fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nbits) From 84d67e7be18d8d0b9a59a7e97f1c3348b8414751 Mon Sep 17 00:00:00 2001 From: greg landrum Date: Sat, 26 Nov 2022 05:50:26 +0100 Subject: [PATCH 4/9] update --- scoring/fingerprint_lib.py | 115 ++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index cd24a07..ac22b00 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -11,42 +11,22 @@ from rdkit.Chem.ChemicalFeatures import BuildFeatureFactory from rdkit.Chem import rdMolDescriptors -import pickle -from rdkit.Chem import BRICS -fragmentList = pickle.load(open('/home/glandrum/Projects/reversible_fingerprints/data/frags.min2.ordered.pkl','rb')) -from collections import Counter,defaultdict -def generateFragmentFingerprint(mol,nBits=4096,fragmentList=fragmentList): - frags = BRICS.BRICSDecompose(mol,minFragmentSize=2) - res = 
Counter() - for frag in frags: - try: - idx = fragmentList.index(frag) - except ValueError: - continue - res[idx%nBits] += 1 - return res -def FoldedRDKFingerprintCountBased(mol,fpSize=1024,**kwargs): +def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): bitInfo = {} - unfolded = Chem.UnfoldedRDKFingerprintCountBased(mol,branchedPaths=False,minPath=3,maxPath=3,bitInfo=bitInfo, + unfolded = Chem.UnfoldedRDKFingerprintCountBased(mol, + branchedPaths=False, + minPath=3, + maxPath=3, + bitInfo=bitInfo, **kwargs) res = {} newBitInfo = defaultdict(list) - for k,b in unfolded.GetNonzeroElements().items(): - res[k%fpSize] = b - newBitInfo[k%fpSize].extend(bitInfo[k]) - return res,newBitInfo + for k, b in unfolded.GetNonzeroElements().items(): + res[k % fpSize] = b + newBitInfo[k % fpSize].extend(bitInfo[k]) + return res, newBitInfo -from rdkit import DataStructs -def GetReversibleFingerprint(mol,nFragmentBits=4096,nRDKitBits=1024): - res = DataStructs.UIntSparseIntVect(nFragmentBits+nRDKitBits) - fragfp = generateFragmentFingerprint(mol,nBits=nFragmentBits) - rdkfp,_ = FoldedRDKFingerprintCountBased(mol,fpSize=nRDKitBits) - for bit,count in fragfp.items(): - res[bit] = count - for bit,count in rdkfp.items(): - res[bit+nFragmentBits] = count - return res # implemented fingerprints: # ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), @@ -60,45 +40,72 @@ def GetReversibleFingerprint(mol,nFragmentBits=4096,nRDKitBits=1024): # RDKit with path length = 5 (rdk5), with path length = 6 (rdk6), with path length = 7 (rdk7) # 2D pharmacophore (pharm) ????????????? -nbits = 2048 +nbits = 1024 longbits = 16384 # dictionary fpdict = {} -fpdict['reversible'] = GetReversibleFingerprint -fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 0, nBits=nbits) -fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=nbits) -fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nbits) -fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=nbits) +fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 0, nBits=nbits) +fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 1, nBits=nbits) +fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, nBits=nbits) +fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, nBits=nbits) fpdict['ecfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0) fpdict['ecfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1) fpdict['ecfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2) fpdict['ecfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3) -fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, useFeatures=True, nBits=nbits) -fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True, nBits=nbits) -fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, useFeatures=True, nBits=nbits) -fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1, useFeatures=True) -fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2, useFeatures=True) -fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3, useFeatures=True) -fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=longbits) -fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=longbits) -fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True, nBits=longbits) -fpdict['lfcfp6'] = lambda m: 
AllChem.GetMorganFingerprintAsBitVect(m, 3, useFeatures=True, nBits=longbits) +fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 1, useFeatures=True, nBits=nbits) +fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, useFeatures=True, nBits=nbits) +fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, useFeatures=True, nBits=nbits) +fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint( + m, 1, useFeatures=True) +fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint( + m, 2, useFeatures=True) +fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint( + m, 3, useFeatures=True) +fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, nBits=longbits) +fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, nBits=longbits) +fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 2, useFeatures=True, nBits=longbits) +fpdict['lfcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( + m, 3, useFeatures=True, nBits=longbits) fpdict['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m) -fpdict['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m) -fpdict['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m) -fpdict['hashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(m, nBits=nbits) -fpdict['hashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=nbits) +fpdict['ap'] = lambda m: rdMolDescriptors.GetAtomPairFingerprint(m) +fpdict['tt'] = lambda m: rdMolDescriptors.GetTopologicalTorsionFingerprint(m) +fpdict[ + 'hashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( + m, nBits=nbits) +fpdict[ + 'hashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( + m, nBits=nbits) fpdict['avalon'] = lambda m: fpAvalon.GetAvalonFP(m, nbits) fpdict['laval'] = lambda m: fpAvalon.GetAvalonFP(m, longbits) -fpdict['rdk5'] = lambda m: Chem.RDKFingerprint(m, maxPath=5, fpSize=nbits, nBitsPerHash=2) -fpdict['rdk6'] = lambda m: Chem.RDKFingerprint(m, maxPath=6, fpSize=nbits, nBitsPerHash=2) -fpdict['rdk7'] = lambda m: Chem.RDKFingerprint(m, maxPath=7, fpSize=nbits, nBitsPerHash=2) +fpdict['rdk5'] = lambda m: Chem.RDKFingerprint( + m, maxPath=5, fpSize=nbits, nBitsPerHash=2) +fpdict['rdk6'] = lambda m: Chem.RDKFingerprint( + m, maxPath=6, fpSize=nbits, nBitsPerHash=2) +fpdict['rdk7'] = lambda m: Chem.RDKFingerprint( + m, maxPath=7, fpSize=nbits, nBitsPerHash=2) +fpdict['lrdk5'] = lambda m: Chem.RDKFingerprint( + m, maxPath=5, fpSize=longbits, nBitsPerHash=2) +fpdict['lrdk6'] = lambda m: Chem.RDKFingerprint( + m, maxPath=6, fpSize=longbits, nBitsPerHash=2) +fpdict['lrdk7'] = lambda m: Chem.RDKFingerprint( + m, maxPath=7, fpSize=longbits, nBitsPerHash=2) def CalculateFP(fp_name, smiles): m = Chem.MolFromSmiles(smiles) if m is None: - raise ValueError('SMILES cannot be converted to a RDKit molecules:', smiles) + raise ValueError('SMILES cannot be converted to a RDKit molecules:', + smiles) return fpdict[fp_name](m) From cef81bf9f5a725e40960c195ae7c319711e7f45e Mon Sep 17 00:00:00 2001 From: greg landrum Date: Wed, 30 Nov 2022 04:15:29 +0100 Subject: [PATCH 5/9] seems to work --- .../data_sets_II/calculate_scored_lists_RF.py | 10 +- .../calculate_scored_lists_XGB.py | 225 ++++++++++++++++++ .../calculate_scored_lists_lmnb.py | 213 +++++++++++++++++ scoring/fingerprint_lib.py | 50 ++-- scoring/ml_functions_13.py | 81 ++++--- 5 files changed, 510 insertions(+), 69 deletions(-) 
create mode 100644 scoring/data_sets_II/calculate_scored_lists_XGB.py create mode 100644 scoring/data_sets_II/calculate_scored_lists_lmnb.py diff --git a/scoring/data_sets_II/calculate_scored_lists_RF.py b/scoring/data_sets_II/calculate_scored_lists_RF.py index e40d390..21dc3c1 100644 --- a/scoring/data_sets_II/calculate_scored_lists_RF.py +++ b/scoring/data_sets_II/calculate_scored_lists_RF.py @@ -55,8 +55,7 @@ import pickle, gzip, sys, os, os.path, numpy from collections import defaultdict from optparse import OptionParser -from sklearn.ensemble import RandomForestClassifier, forest -from sklearn.tree import tree +from sklearn.ensemble import RandomForestClassifier from rdkit.ML.Data import DataUtils from multiprocessing import Pool @@ -85,7 +84,6 @@ read_dict = {} read_dict['criterion'] = lambda x: x read_dict['max_depth'] = lambda x: int(x) -read_dict['max_features'] = lambda x: x read_dict['num_estimators'] = lambda x: int(x) read_dict['min_samples_split'] = lambda x: int(x) read_dict['min_samples_leaf'] = lambda x: int(x) @@ -99,7 +97,7 @@ parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train random forest with") parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") -parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, max_features=auto (=sqrt), num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") ############# MAIN PART ######################## @@ -129,12 +127,12 @@ scor.checkSimil(simil_metric) # default machine-learning method variables - ml_dict = dict(criterion='gini', max_features='auto', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) + ml_dict = dict(criterion='gini', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) if options.ml: ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) # initialize machine-learning method - ml = RandomForestClassifier(criterion=ml_dict['criterion'], max_features=ml_dict['max_features'], min_samples_split=ml_dict['min_samples_split'], max_depth=ml_dict['max_depth'], min_samples_leaf=ml_dict['min_samples_leaf'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs']) + ml = RandomForestClassifier(criterion=ml_dict['criterion'], min_samples_split=ml_dict['min_samples_split'], max_depth=ml_dict['max_depth'], min_samples_leaf=ml_dict['min_samples_leaf'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs']) # loop over targets for target in conf.set_data: diff --git a/scoring/data_sets_II/calculate_scored_lists_XGB.py b/scoring/data_sets_II/calculate_scored_lists_XGB.py new file mode 100644 index 0000000..3e65dad --- /dev/null +++ b/scoring/data_sets_II/calculate_scored_lists_XGB.py @@ -0,0 +1,225 @@ +# +# calculates fingerprints and scores lists +# based on 
the predicted probability
+#
+# INPUT
+# required:
+# -f [] : fingerprint to build the XGBoost classifier with
+# optional:
+# -o [] : relative output path (default: pwd)
+# -a : append to the output file (default: overwrite)
+# -s [] : similarity metric (default: Dice,
+#         other options: Tanimoto, Cosine, Russel, Kulczynski,
+#         McConnaughey, Manhattan, RogotGoldberg)
+# -m [] : file containing the XGBoost parameters
+#         default parameters: max_depth=3,
+#         num_estimators=300, n_jobs=4
+# --help : prints usage
+#
+# OUTPUT: for each target in each data set
+# a file with a list (1 element) of XGB prediction
+# per XGB prediction: [name, list of 50 scored lists]
+#
+# Copyright (c) 2022, Greg Landrum
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
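This new scorer drives XGBoost through its scikit-learn-style wrapper, so relative to the RF script only the classifier class and its defaults change. A minimal sketch of the interface relied on below, using the script's defaults (max_depth=3, n_estimators=300, n_jobs=4) and hypothetical toy fingerprints:

import numpy as np
from xgboost import XGBClassifier

# toy data: rows are fingerprint bit vectors, 1 = active, 0 = decoy
X = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]])
y = np.array([1, 0, 1, 0])

ml = XGBClassifier(max_depth=3, n_estimators=300, n_jobs=4)
ml.fit(X, y)
# column 1 of predict_proba is P(active), which the script ranks on
print(ml.predict_proba(X)[:, 1])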
+# + +from rdkit import Chem, DataStructs +import pickle, gzip, sys, os, os.path, numpy +from collections import defaultdict +from optparse import OptionParser +from xgboost import XGBClassifier +from rdkit.ML.Data import DataUtils +from multiprocessing import Pool + +# import configuration file with global variables +sys.path.insert(0, os.getcwd()+'/../../') +import configuration_file_II as conf + +# import functions for scoring step +sys.path.insert(0, os.getcwd()+'/../') +import scoring_functions as scor + +# import ML functions +import ml_functions_13 as ml_func + +# paths +cwd = os.getcwd() +parentpath = cwd+'/../../' +inpath_cmp = parentpath+'compounds/' +inpath_list = parentpath+'query_lists/data_sets_II/ChEMBL/' +path = cwd+'/' + +# flag to read in ChEMBL decoys only once +firstchembl = True + +# dictionary for readMLFile() +read_dict = {} +read_dict['max_depth'] = lambda x: int(x) +read_dict['num_estimators'] = lambda x: int(x) +read_dict['n_jobs'] = lambda x: int(x) + +# forest._parallel_build_trees = ml_func._balanced_parallel_build_trees + +# prepare command-line option parser +usage = "usage: %prog [options] arg" +parser = OptionParser(usage) +parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train random forest with") +parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") +parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") +parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") + +############# MAIN PART ######################## +if __name__=='__main__': + + # read in command line options + (options, args) = parser.parse_args() + # required arguments + if options.fp: + fp_build = options.fp + else: + raise RuntimeError('one or more of the required options was not given!') + + # optional arguments + do_append = False + if options.do_append: do_append = options.do_append + simil_metric = 'Dice' + if options.simil: simil_metric = options.simil + outpath = path + outpath_set = False + if options.outpath: + outpath_set = True + outpath = path+options.outpath + + # check for sensible input + if outpath_set: scor.checkPath(outpath, 'output') + scor.checkSimil(simil_metric) + + # default machine-learning method variables + ml_dict = dict( n_jobs=4, max_depth=3, num_estimators=300) + if options.ml: + ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) + + # initialize machine-learning method + ml = XGBClassifier(max_depth=ml_dict['max_depth'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs']) + + # loop over targets + for target in conf.set_data: + print( target) + + # read in training actives and calculate fps + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) + for k in actives.keys(): + for i,m in enumerate(actives[k]): + fp_dict = scor.getFP(fp_build, m[1]) + actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict] + + # read in test actives and calculate fps + div_actives = [] + for line in 
gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + div_actives.append([line[1], fp_dict]) + num_test_actives = conf.num_div_act - 1 + # convert fps to numpy arrays + np_fps_div_act = ml_func.getNumpy(div_actives) + + # read in decoys and calculate fps + if firstchembl: + decoys = [] + for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + decoys.append([line[1], fp_dict]) + # convert fps to numpy arrays + np_fps_dcy = ml_func.getNumpy(decoys) + firstchembl = False + num_decoys = len(decoys) + print( "molecules read in and fingerprints calculated") + + # open training and test lists + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') + # to store the scored lists + scores = defaultdict(list) + # loop over repetitions + for q in actives.keys(): + print( q) + num_actives = len(actives[q]) + np_fps_act = ml_func.getNumpy(actives[q]) + training_list = pickle.load(training_input) + test_list = pickle.load(test_input) + test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] + + # list with active/inactive info + ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives) + # training fps + train_fps = [actives[q][i][1] for i in range(num_actives)] + np_train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]] + # fit random forest + ml.fit(np_train_fps, ys_fit) + + # test fps and molecule info + test_fps = [div_actives[i][1] for i in test_list[:num_test_actives]] + test_fps += [decoys[i][1] for i in test_list[num_test_actives:]] + np_test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]] + np_test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]] + test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]] + test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]] + + # calculate similarity with standard fp + std_simil = [] + for fp in test_fps: + tmp_simil = scor.getBulkSimilarity(fp, train_fps, simil_metric) + tmp_simil.sort(reverse=True) + std_simil.append(tmp_simil[0]) + + # rank based on probability (and second based on similarity) + single_score = ml.predict_proba(np_test_fps) + # store: [probability, similarity, internal ID, active/inactive] + single_score = [[m[1], s, t[0], t[1]] for m,s,t in zip(single_score,std_simil,test_mols)] + single_score.sort(reverse=True) + scores['xgb_'+fp_build].append(single_score) + + # write scores to file + if do_append: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format + else: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format + for fp in ['xgb_'+fp_build]: + pickle.dump([fp, scores[fp]], outfile, 2) + outfile.close() + print( "scoring done and scored lists written") diff --git a/scoring/data_sets_II/calculate_scored_lists_lmnb.py b/scoring/data_sets_II/calculate_scored_lists_lmnb.py new file mode 100644 index 0000000..4ce4405 --- /dev/null +++ 
b/scoring/data_sets_II/calculate_scored_lists_lmnb.py @@ -0,0 +1,213 @@ +# +# calculates fingerprints and scores lists +# based on the predicted probability +# +# INPUT +# required: +# -f [] : fingerprint to build the Naive Bayes with +# optional: +# -o [] : relative output path (default: pwd) +# -a : append to the output file (default: overwrite) +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, +# McConnaughey, Manhattan, RogotGoldberg) +# -r [] : file containing the Naive Bayes info +# default parameters: alpha=1.0, binarize=None, +# fit_prior=1 (True) +# --help : prints usage +# +# OUTPUT: for each target in each data set +# a file with a list (1 element) of NB prediction +# per NB prediction: [name, list of 50 scored lists] +# +# Copyright (c) 2022, Greg Landrum +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
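LaplacianNB is imported below from an external "bayes" package rather than from scikit-learn, so its exact API cannot be checked here; judging only from the calls the script makes, it takes each sample as a set of on-bit indices instead of a dense array. A sketch of that conversion with illustrative data; the commented-out calls mirror the code that follows:

# dense fingerprint row -> set of on-bit indices
fp = [0, 1, 0, 0, 1, 1]                          # hypothetical example row
on_bits = set(i for i, x in enumerate(fp) if x)  # -> {1, 4, 5}

# ml = LaplacianNB(alpha=1.0, fit_prior=True)
# ml.fit([on_bits], [1])              # lists of on-bit sets plus 1/0 labels
# probs = ml.predict_proba([on_bits])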
+# + +from rdkit import Chem, DataStructs +import pickle, gzip, sys, os, os.path, numpy +from collections import defaultdict +from optparse import OptionParser +from bayes.LaplacianNB import LaplacianNB + +# import configuration file with global variables +sys.path.insert(0, os.getcwd()+'/../../') +import configuration_file_II as conf + +# import functions for scoring step +sys.path.insert(0, os.getcwd()+'/../') +import scoring_functions as scor + +# import ML functions +import ml_functions_13 as ml_func + +# paths +cwd = os.getcwd() +parentpath = cwd+'/../../' +inpath_cmp = parentpath+'compounds/' +inpath_list = parentpath+'query_lists/data_sets_II/ChEMBL/' +path = cwd+'/' + +# flag to read in ChEMBL decoys only once +firstchembl = True + +# dictionary for readMLFile() +read_dict = {} +read_dict['alpha'] = lambda x: float(x) +read_dict['fit_prior'] = lambda x: bool(x) + +# prepare command-line option parser +usage = "usage: %prog [options] arg" +parser = OptionParser(usage) +parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train Naive Bayes with") +parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") +parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the Naive Bayes info (default parameters: alpha=1.0, binarize=None, fit_prior=1 (True))") +parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") + +############# MAIN PART ######################## +if __name__=='__main__': + + # read in command line options + (options, args) = parser.parse_args() + # required arguments + if options.fp: + fp_build = options.fp + else: + raise RuntimeError('one or more of the required options was not given!') + + # optional arguments + do_append = False + if options.do_append: do_append = options.do_append + simil_metric = 'Dice' + if options.simil: simil_metric = options.simil + outpath = path + outpath_set = False + if options.outpath: + outpath_set = True + outpath = path+options.outpath + + # check for sensible input + if outpath_set: scor.checkPath(outpath, 'output') + scor.checkSimil(simil_metric) + + # default machine-learning method variables + ml_dict = dict(alpha=1.0, fit_prior=True) + if options.ml: + ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml) + + # initialize machine-learning method + ml = LaplacianNB(alpha=ml_dict['alpha'], fit_prior=ml_dict['fit_prior']) + + # loop over targets + for target in conf.set_data: + print(target) + + # read in training actives and calculate fps + actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb')) + for k in actives.keys(): + for i,m in enumerate(actives[k]): + fp_dict = scor.getFP(fp_build, m[1]) + actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict] + + # read in test actives and calculate fps + div_actives = [] + for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + div_actives.append([line[1], fp_dict]) + 
num_test_actives = conf.num_div_act - 1 + # convert fps to numpy arrays + np_fps_div_act = ml_func.getNumpy(div_actives,dtyp=int) + + # read in decoys and calculate fps + if firstchembl: + decoys = [] + for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'): + line=line.decode('UTF-8') + if line[0] != '#': + # structure of line: [external ID, internal ID, SMILES]] + line = line.rstrip().split() + fp_dict = scor.getFP(fp_build, line[2]) + # store: [internal ID, dict with fps] + decoys.append([line[1], fp_dict]) + # convert fps to numpy arrays + np_fps_dcy = ml_func.getNumpy(decoys,dtyp=int) + firstchembl = False + num_decoys = len(decoys) + print("molecules read in and fingerprints calculated") + + # open training and test lists + training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb') + test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb') + # to store the scored lists + scores = defaultdict(list) + # loop over repetitions + for q in actives.keys(): + print(q) + num_actives = len(actives[q]) + np_fps_act = ml_func.getNumpy(actives[q],dtyp=int) + training_list = pickle.load(training_input) + test_list = pickle.load(test_input) + test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]] + + # list with active/inactive info + ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives) + # training fps + train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]] + # lmnb wants sets of on-bits + train_fps = [set([i for i,x in enumerate(fp) if x]) for fp in train_fps] + # fit Naive Bayes + ml.fit(train_fps, ys_fit) + + # test fps and molecule info + test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]] + test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]] + test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]] + test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]] + + test_fps = [set([i for i,x in enumerate(fp) if x]) for fp in test_fps] + # rank based on probability + single_score = ml.predict_proba(test_fps) + # store: [probability, internal ID, active/inactive] + single_score = [[s[1], m[0], m[1]] for s,m in zip(single_score, test_mols)] + single_score.sort(reverse=True) + scores['lmnb_'+fp_build].append(single_score) + + # write scores to file + if do_append: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format + else: + outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format + for fp in ['lmnb_'+fp_build]: + pickle.dump([fp, scores[fp]], outfile, 2) + outfile.close() + print("scoring done and scored lists written") + #break diff --git a/scoring/fingerprint_lib.py b/scoring/fingerprint_lib.py index ac22b00..56b9fc7 100644 --- a/scoring/fingerprint_lib.py +++ b/scoring/fingerprint_lib.py @@ -29,14 +29,14 @@ def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): # implemented fingerprints: -# ECFC0 (ecfc0), ECFP0 (ecfp0), MACCS (maccs), +# mfc0 (mfc0), mfp0 (mfp0), MACCS (maccs), # atom pairs (ap), atom pairs bit vector (apbv), topological torsions (tt) # hashed atom pairs (hashap), hashed topological torsions (hashtt) --> with 1024 bits -# ECFP4 (ecfp4), ECFP6 (ecfp6), ECFC4 (ecfc4), ECFC6 (ecfc6) --> with 1024 bits -# FCFP4 (fcfp4), FCFP6 (fcfp6), FCFC4 (fcfc4), FCFC6 (fcfc6) --> with 1024 bits +# mfp2 (mfp2), mfp3 (mfp3), mfc2 (mfc2), mfc3 (mfc3) --> with 1024 bits +# fmfp2 (fmfp2), fmfp3 (fmfp3), fmfc2 (fmfc2), fmfc3 (fmfc3) --> with 1024 bits # Avalon 
(avalon) --> with 1024 bits # long Avalon (laval) --> with 16384 bits -# long ECFP4 (lecfp4), long ECFP6 (lecfp6), long FCFP4 (lfcfp4), long FCFP6 (lfcfp6) --> with 16384 bits +# long mfp2 (lmfp2), long mfp3 (lmfp3), long fmfp2 (lfmfp2), long fmfp3 (lfmfp3) --> with 16384 bits # RDKit with path length = 5 (rdk5), with path length = 6 (rdk6), with path length = 7 (rdk7) # 2D pharmacophore (pharm) ????????????? @@ -45,37 +45,37 @@ def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): # dictionary fpdict = {} -fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 0, nBits=nbits) -fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp1'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 1, nBits=nbits) -fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=nbits) -fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=nbits) -fpdict['ecfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0) -fpdict['ecfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1) -fpdict['ecfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2) -fpdict['ecfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3) -fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['mfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0) +fpdict['mfc1'] = lambda m: AllChem.GetMorganFingerprint(m, 1) +fpdict['mfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 2) +fpdict['mfc3'] = lambda m: AllChem.GetMorganFingerprint(m, 3) +fpdict['fmfp1'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 1, useFeatures=True, nBits=nbits) -fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['fmfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=nbits) -fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['fmfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=nbits) -fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint( +fpdict['fmfc1'] = lambda m: AllChem.GetMorganFingerprint( m, 1, useFeatures=True) -fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint( +fpdict['fmfc2'] = lambda m: AllChem.GetMorganFingerprint( m, 2, useFeatures=True) -fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint( +fpdict['fmfc3'] = lambda m: AllChem.GetMorganFingerprint( m, 3, useFeatures=True) -fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lmfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=longbits) -fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lmfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=longbits) -fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lfmfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=longbits) -fpdict['lfcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( +fpdict['lfmfp3'] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=longbits) fpdict['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m) fpdict['ap'] = lambda m: rdMolDescriptors.GetAtomPairFingerprint(m) @@ -86,6 +86,12 @@ def FoldedRDKFingerprintCountBased(mol, fpSize=1024, **kwargs): fpdict[ 'hashtt'] = lambda m: 
rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( m, nBits=nbits) +fpdict[ + 'lhashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( + m, nBits=longbits) +fpdict[ + 'lhashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( + m, nBits=longbits) fpdict['avalon'] = lambda m: fpAvalon.GetAvalonFP(m, nbits) fpdict['laval'] = lambda m: fpAvalon.GetAvalonFP(m, longbits) fpdict['rdk5'] = lambda m: Chem.RDKFingerprint( diff --git a/scoring/ml_functions_13.py b/scoring/ml_functions_13.py index d26ed6a..5fa077b 100644 --- a/scoring/ml_functions_13.py +++ b/scoring/ml_functions_13.py @@ -37,59 +37,58 @@ from rdkit.ML.Data import DataUtils import numpy from multiprocessing import Pool -from sklearn.ensemble import RandomForestClassifier, forest -from sklearn.tree import tree +from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import BernoulliNB -### FOR SKLEARN VERSION 0.13 ### +# ### FOR SKLEARN VERSION 0.13 ### -# HELPER FUNCTIONS FOR RANDOM FOREST -def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose): - """Private function used to build a batch of trees within a job""" - from sklearn.utils import check_random_state - from sklearn.utils.fixes import bincount - import random - MAX_INT = numpy.iinfo(numpy.int32).max - random_state = check_random_state(seed) +# # HELPER FUNCTIONS FOR RANDOM FOREST +# def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose): +# """Private function used to build a batch of trees within a job""" +# from sklearn.utils import check_random_state +# from sklearn.utils.fixes import bincount +# import random +# MAX_INT = numpy.iinfo(numpy.int32).max +# random_state = check_random_state(seed) - trees = [] - for i in xrange(n_trees): - if verbose > 1: - print("building tree %d of %d" % (i+1, n_trees)) - seed = random_state.randint(MAX_INT) +# trees = [] +# for i in xrange(n_trees): +# if verbose > 1: +# print("building tree %d of %d" % (i+1, n_trees)) +# seed = random_state.randint(MAX_INT) - tree = forest._make_estimator(append = False) - tree.set_params(compute_importances=forest.compute_importances) - tree.set_params(random_state = check_random_state(seed)) +# tree = forest._make_estimator(append = False) +# tree.set_params(compute_importances=forest.compute_importances) +# tree.set_params(random_state = check_random_state(seed)) - if forest.bootstrap: - n_samples = X.shape[0] - if sample_weight is None: - curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64) - else: - curr_sample_weight = sample_weight.copy() +# if forest.bootstrap: +# n_samples = X.shape[0] +# if sample_weight is None: +# curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64) +# else: +# curr_sample_weight = sample_weight.copy() - ty = list(enumerate(y)) - indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0] - indices2 = random_state.randint(0, len(indices), len(indices)) - indices = [indices[j] for j in indices2] - sample_counts = bincount(indices, minlength=n_samples) +# ty = list(enumerate(y)) +# indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0] +# indices2 = random_state.randint(0, len(indices), len(indices)) +# indices = [indices[j] for j in indices2] +# sample_counts = bincount(indices, minlength=n_samples) - curr_sample_weight *= sample_counts - curr_sample_mask = sample_mask.copy() - 
curr_sample_mask[sample_counts==0] = False +# curr_sample_weight *= sample_counts +# curr_sample_mask = sample_mask.copy() +# curr_sample_mask[sample_counts==0] = False - tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False) - tree.indices = curr_sample_mask - else: - tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False) - trees.append(tree) - return trees +# tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False) +# tree.indices = curr_sample_mask +# else: +# tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False) +# trees.append(tree) +# return trees -def getNumpy(inlist): +def getNumpy(inlist,dtyp=float): outlist = [] for i in inlist: - arr = numpy.zeros((3,), tree.DTYPE) + arr = numpy.zeros((3,), dtyp) DataStructs.ConvertToNumpyArray(i[1], arr) outlist.append(arr) return outlist From 23e826d2948dfb669e3731255e5038ea8d357edb Mon Sep 17 00:00:00 2001 From: greg landrum Date: Wed, 30 Nov 2022 04:47:55 +0100 Subject: [PATCH 6/9] additional validation funcs --- validation/validation_functions.py | 105 ++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/validation/validation_functions.py b/validation/validation_functions.py index e8ffd21..a1b1861 100644 --- a/validation/validation_functions.py +++ b/validation/validation_functions.py @@ -37,6 +37,7 @@ import os from collections import defaultdict from rdkit.ML.Scoring import Scoring +from sklearn.metrics import roc_auc_score, average_precision_score def checkPaths(filepaths): @@ -45,21 +46,27 @@ def checkPaths(filepaths): if not os.path.exists(f): raise IOError('path does not exist:', f) + def _readMethods(line): '''Helper function for readMethods()''' - if line: # if params are provided + if line: # if params are provided params = [] - for i in line: params.append(float(i)) + for i in line: + params.append(float(i)) else: raise ValueError("Method requires parameters.") return params + # dictionary for readMethods() read_dict = {} read_dict['AUC'] = lambda l: EvalMethod(l[0]) read_dict['EF'] = lambda l: EFMethod(l[0], _readMethods(l[1:]), 100) read_dict['BEDROC'] = lambda l: BEDROCMethod(l[0], _readMethods(l[1:]), 1) read_dict['RIE'] = lambda l: RIEMethod(l[0], _readMethods(l[1:]), 1) +read_dict['AUROC'] = lambda l: AUROCEvalMethod(l[0]) +read_dict['AUPRC'] = lambda l: AUPRCEvalMethod(l[0]) + def readMethods(filepath): '''Reads the methods names and parameters from a file''' @@ -70,11 +77,12 @@ def readMethods(filepath): else: method_dict = {} for line in myfile: - if line[0] != "#": # ignore comments + if line[0] != "#": # ignore comments line = line.rstrip().split() method_dict[line[0]] = read_dict[line[0]](line) return method_dict + def readFPs(filepath): '''Reads a list of fingerprints from a file''' try: @@ -84,35 +92,38 @@ def readFPs(filepath): else: fps = [] for line in myfile: - if line[0] != "#": # ignore comments + if line[0] != "#": # ignore comments line = line.rstrip().split() fps.append(line[0]) return fps + def printInputParam(method_dict, inpath): '''Prints the input parameters''' - print( "-------------------------------") - print( "PARAMETERS USED") - print( "Validation methods: ") + print("-------------------------------") + print("PARAMETERS USED") + print("Validation methods: ") for m in method_dict.keys(): if 
isinstance(method_dict[m], ParamEvalMethod): - print( m, "- parameters:", method_dict[m].params) + print(m, "- parameters:", method_dict[m].params) else: - print( m) - print( "") - print( "Input paths:") + print(m) + print("") + print("Input paths:") for inp in inpath: - print( inp) - print( "-------------------------------") + print(inp) + print("-------------------------------") + def printFPs(fps): '''Prints a list of fingerprints''' - print( "-------------------------------") - print( "FINGERPRINTS CONSIDERED") + print("-------------------------------") + print("FINGERPRINTS CONSIDERED") for fp in fps: - print( " ",fp) - print( "") - print( "-------------------------------") + print(" ", fp) + print("") + print("-------------------------------") + def getName(fp, fp_names): '''Determines the new name of a fingerprint in case @@ -120,34 +131,61 @@ def getName(fp, fp_names): # check if fp already exists. if yes, add a number if fp in fp_names: suffix = 2 - tmp_name = fp+'_'+str(suffix) + tmp_name = fp + '_' + str(suffix) while tmp_name in fp_names: suffix += 1 - tmp_name = fp+'_'+str(suffix) + tmp_name = fp + '_' + str(suffix) return tmp_name else: return fp + # class for handling of evaluation methods class EvalMethod: def __init__(self, name): self.method_name = name self.names = name + def addNames(self, results): results[self.method_name] = defaultdict(list) + def calculate(self, score, index): - return Scoring.CalcAUC(score,index) + return Scoring.CalcAUC(score, index) + def runMethod(self, results, scores, query, index): tmp_list = [] - for k in scores.keys(): # fingerprints + for k in scores.keys(): # fingerprints tmp = self.calculate(scores[k][query], index) tmp_list.append([tmp, k]) # sort list according to the descending score tmp_list.sort(reverse=True) # store [score, rank] - for i,l in enumerate(tmp_list): + for i, l in enumerate(tmp_list): # l[1] = fp, l[0] = score, i+1 = rank - results[self.method_name][l[1]].append([l[0], i+1]) + results[self.method_name][l[1]].append([l[0], i + 1]) + + +class AUROCEvalMethod(EvalMethod): + def __init__(self, name): + self.method_name = name + self.names = name + + def calculate(self, score, index): + scores = [x[0] for x in score] + acts = [x[index] for x in score] + return roc_auc_score(acts, scores) + + +class AUPRCEvalMethod(EvalMethod): + def __init__(self, name): + self.method_name = name + self.names = name + + def calculate(self, score, index): + scores = [x[0] for x in score] + acts = [x[index] for x in score] + return average_precision_score(acts, scores) + class ParamEvalMethod(EvalMethod): def __init__(self, name, params, factor): @@ -155,10 +193,12 @@ def __init__(self, name, params, factor): self.params = params self.names = [] for p in self.params: - self.names.append(name + str(int(factor*p))) + self.names.append(name + str(int(factor * p))) + def addNames(self, results): for n in self.names: results[n] = defaultdict(list) + def runMethod(self, results, scores, query, index): tmp_list = [[] for i in range(len(self.names))] # loop over fingerprints @@ -168,28 +208,31 @@ def runMethod(self, results, scores, query, index): for i in range(len(self.names)): tmp_list[i].append([tmp[i], k]) # loop over parameters - for i,n in enumerate(self.names): + for i, n in enumerate(self.names): # sort list according to the descending score tmp_list[i].sort(reverse=True) # store [score, rank] - for j,l in enumerate(tmp_list[i]): + for j, l in enumerate(tmp_list[i]): # l[1] = fp, l[0] = score, j+1 = rank - results[n][l[1]].append([l[0], 
j+1]) + results[n][l[1]].append([l[0], j + 1]) + class EFMethod(ParamEvalMethod): def calculate(self, score, index): - return Scoring.CalcEnrichment(score,index,self.params) + return Scoring.CalcEnrichment(score, index, self.params) + class BEDROCMethod(ParamEvalMethod): def calculate(self, score, index): tmp = [] for p in self.params: - tmp.append(Scoring.CalcBEDROC(score,index,p)) + tmp.append(Scoring.CalcBEDROC(score, index, p)) return tmp + class RIEMethod(ParamEvalMethod): def calculate(self, score, index): tmp = [] for p in self.params: - tmp.append(Scoring.CalcRIE(score,index,p)) + tmp.append(Scoring.CalcRIE(score, index, p)) return tmp From a1225b08904cd7775690af55288a400522841cf1 Mon Sep 17 00:00:00 2001 From: greg landrum Date: Sun, 18 Dec 2022 11:17:14 +0100 Subject: [PATCH 7/9] add BRF scorer --- .../calculate_scored_lists_BRF.py | 231 ++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 scoring/data_sets_II/calculate_scored_lists_BRF.py diff --git a/scoring/data_sets_II/calculate_scored_lists_BRF.py b/scoring/data_sets_II/calculate_scored_lists_BRF.py new file mode 100644 index 0000000..84ddcf0 --- /dev/null +++ b/scoring/data_sets_II/calculate_scored_lists_BRF.py @@ -0,0 +1,231 @@ +# +# calculates fingerprints and scores lists +# based on the predicted probability +# +# INPUT +# required: +# -f [] : fingerprint to build the balanced random forest with +# optional: +# -o [] : relative output path (default: pwd) +# -a : append to the output file (default: overwrite) +# -s [] : similarity metric (default: Dice, +# other options: Tanimoto, Cosine, Russel, Kulczynski, +# McConnaughey, Manhattan, RogotGoldberg) +# -r [] : file containing the random forest info +# default parameters: criterion=gini, max_depth=10, +# max_features=auto (=sqrt), num_estimators=100, +# min_samples_split=2, min_samples_leaf=1, n_jobs=1 +# --help : prints usage +# +# OUTPUT: for each target in each data set +# a file with a list (1 element) of RF prediction +# per RF prediction: [name, list of 50 scored lists] +# +# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote +# products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +from rdkit import Chem, DataStructs +import pickle, gzip, sys, os, os.path, numpy +from collections import defaultdict +from optparse import OptionParser +from imblearn.ensemble import BalancedRandomForestClassifier +from rdkit.ML.Data import DataUtils +from multiprocessing import Pool + +# import configuration file with global variables +sys.path.insert(0, os.getcwd()+'/../../') +import configuration_file_II as conf + +# import functions for scoring step +sys.path.insert(0, os.getcwd()+'/../') +import scoring_functions as scor + +# import ML functions +import ml_functions_13 as ml_func + +# paths +cwd = os.getcwd() +parentpath = cwd+'/../../' +inpath_cmp = parentpath+'compounds/' +inpath_list = parentpath+'query_lists/data_sets_II/ChEMBL/' +path = cwd+'/' + +# flag to read in ChEMBL decoys only once +firstchembl = True + +# dictionary for readMLFile() +read_dict = {} +read_dict['criterion'] = lambda x: x +read_dict['max_depth'] = lambda x: int(x) +read_dict['num_estimators'] = lambda x: int(x) +read_dict['min_samples_split'] = lambda x: int(x) +read_dict['min_samples_leaf'] = lambda x: int(x) +read_dict['n_jobs'] = lambda x: int(x) + +# forest._parallel_build_trees = ml_func._balanced_parallel_build_trees + +# prepare command-line option parser +usage = "usage: %prog [options] arg" +parser = OptionParser(usage) +parser.add_option("-f", "--fingerprint", dest="fp", help="fingerprint to train random forest with") +parser.add_option("-o", "--outpath", dest="outpath", metavar="PATH", help="relative output PATH (default: pwd)") +parser.add_option("-s", "--similarity", dest="simil", type="string", metavar="NAME", help="NAME of similarity metric to use (default: Dice, other options are: Tanimoto, Cosine, Russel, Kulczynski, McConnaughey, Manhattan, RogotGoldberg") +parser.add_option("-m", "--ml", dest="ml", metavar="FILE", help="file containing the random forest info (default parameters: criterion=gini, max_depth=10, num_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs=1)") +parser.add_option("-a", "--append", dest="do_append", action="store_true", help="append to the output file (default: False)") + +############# MAIN PART ######################## +if __name__=='__main__': + + # read in command line options + (options, args) = parser.parse_args() + # required arguments + if options.fp: + fp_build = options.fp + else: + raise RuntimeError('one or more of the required options was not given!') + + # optional arguments + do_append = False + if options.do_append: do_append = options.do_append + simil_metric = 'Dice' + if options.simil: simil_metric = options.simil + outpath = path + outpath_set = False + if options.outpath: + outpath_set = True + outpath = path+options.outpath + + # check for sensible input + if outpath_set: scor.checkPath(outpath, 'output') + scor.checkSimil(simil_metric) + + # default machine-learning method variables + ml_dict = dict(criterion='gini', n_jobs=4, max_depth=10, min_samples_split=2, min_samples_leaf=1, num_estimators=100) 
+    if options.ml:
+        ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml)
+
+    # initialize machine-learning method
+    ml = BalancedRandomForestClassifier(criterion=ml_dict['criterion'], min_samples_split=ml_dict['min_samples_split'], max_depth=ml_dict['max_depth'], min_samples_leaf=ml_dict['min_samples_leaf'], n_estimators=ml_dict['num_estimators'], n_jobs=ml_dict['n_jobs'])
+
+    # loop over targets
+    for target in conf.set_data:
+        print(target)
+
+        # read in training actives and calculate fps
+        actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb'))
+        for k in actives.keys():
+            for i,m in enumerate(actives[k]):
+                fp_dict = scor.getFP(fp_build, m[1])
+                actives[k][i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict]
+
+        # read in test actives and calculate fps
+        div_actives = []
+        for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
+            line=line.decode('UTF-8')
+            if line[0] != '#':
+                # structure of line: [external ID, internal ID, SMILES]
+                line = line.rstrip().split()
+                fp_dict = scor.getFP(fp_build, line[2])
+                # store: [internal ID, dict with fps]
+                div_actives.append([line[1], fp_dict])
+        num_test_actives = conf.num_div_act - 1
+        # convert fps to numpy arrays
+        np_fps_div_act = ml_func.getNumpy(div_actives)
+
+        # read in decoys and calculate fps
+        if firstchembl:
+            decoys = []
+            for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'):
+                line=line.decode('UTF-8')
+                if line[0] != '#':
+                    # structure of line: [external ID, internal ID, SMILES]
+                    line = line.rstrip().split()
+                    fp_dict = scor.getFP(fp_build, line[2])
+                    # store: [internal ID, dict with fps]
+                    decoys.append([line[1], fp_dict])
+            # convert fps to numpy arrays
+            np_fps_dcy = ml_func.getNumpy(decoys)
+            firstchembl = False
+        num_decoys = len(decoys)
+        print("molecules read in and fingerprints calculated")
+
+        # open training and test lists
+        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb')
+        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb')
+        # to store the scored lists
+        scores = defaultdict(list)
+        # loop over repetitions
+        for q in actives.keys():
+            print(q)
+            num_actives = len(actives[q])
+            np_fps_act = ml_func.getNumpy(actives[q])
+            training_list = pickle.load(training_input)
+            test_list = pickle.load(test_input)
+            test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]]
+
+            # list with active/inactive info
+            ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives)
+            # training fps
+            train_fps = [actives[q][i][1] for i in range(num_actives)]
+            np_train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]]
+            # fit random forest
+            ml.fit(np_train_fps, ys_fit)
+
+            # test fps and molecule info
+            test_fps = [div_actives[i][1] for i in test_list[:num_test_actives]]
+            test_fps += [decoys[i][1] for i in test_list[num_test_actives:]]
+            np_test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]]
+            np_test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]]
+            test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]]
+            test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]]
+
+            # calculate similarity with standard fp
+            std_simil = []
+            for fp in test_fps:
+                tmp_simil = scor.getBulkSimilarity(fp, train_fps, simil_metric)
+                tmp_simil.sort(reverse=True)
+                std_simil.append(tmp_simil[0])
+
+            # rank based on probability (and second based on similarity)
+            single_score = ml.predict_proba(np_test_fps)
+            # store: [probability, similarity, internal ID, active/inactive]
+            single_score = [[m[1], s, t[0], t[1]] for m,s,t in zip(single_score,std_simil,test_mols)]
+            single_score.sort(reverse=True)
+            scores['brf_'+fp_build].append(single_score)
+
+        # write scores to file
+        if do_append:
+            outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format
+        else:
+            outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
+        for fp in ['brf_'+fp_build]:
+            pickle.dump([fp, scores[fp]], outfile, 2)
+        outfile.close()
+        print("scoring done and scored lists written")
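The BalancedRandomForestClassifier imported above takes over from the earlier
monkey-patch of sklearn's forest._parallel_build_trees (left commented out in
the script): each tree is now grown on a bootstrap sample in which the decoy
majority class is undersampled to match the actives. A minimal sketch of the
classifier as configured here, with hypothetical toy data standing in for the
fingerprint arrays:

    import numpy as np
    from imblearn.ensemble import BalancedRandomForestClassifier

    X = np.random.randint(0, 2, size=(1000, 1024))  # toy binary fingerprints
    y = np.array([1]*50 + [0]*950)                  # 50 actives, 950 decoys
    brf = BalancedRandomForestClassifier(criterion='gini', n_estimators=100,
                                         max_depth=10, min_samples_split=2,
                                         min_samples_leaf=1, n_jobs=4)
    brf.fit(X, y)
    p_active = brf.predict_proba(X)[:, 1]  # column 1 = probability of class 1

This is why the scoring loop takes m[1] from each predict_proba() row: with
classes [0, 1], the second column is the active-class probability.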
From 7f6df3cac7dd023be659863e9a2fa27bc3cccb3f Mon Sep 17 00:00:00 2001
From: greg landrum
Date: Sun, 18 Dec 2022 11:17:26 +0100
Subject: [PATCH 8/9] update NB scorer

---
 .../data_sets_II/calculate_scored_lists_NB.py | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/scoring/data_sets_II/calculate_scored_lists_NB.py b/scoring/data_sets_II/calculate_scored_lists_NB.py
index 12669ed..d59d21f 100644
--- a/scoring/data_sets_II/calculate_scored_lists_NB.py
+++ b/scoring/data_sets_II/calculate_scored_lists_NB.py
@@ -51,7 +51,8 @@
 #
 
 from rdkit import Chem, DataStructs
-import cPickle, gzip, sys, os, os.path, numpy
+import pickle, gzip, sys, os, os.path
+import numpy as np
 from collections import defaultdict
 from optparse import OptionParser
 from sklearn.naive_bayes import BernoulliNB
@@ -128,10 +129,10 @@
 
     # loop over targets
     for target in conf.set_data:
-        print target
+        print(target)
 
         # read in training actives and calculate fps
-        actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r'))
+        actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb'))
         for k in actives.keys():
             for i,m in enumerate(actives[k]):
                 fp_dict = scor.getFP(fp_build, m[1])
@@ -140,6 +141,7 @@
         # read in test actives and calculate fps
         div_actives = []
         for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
+            line=line.decode('UTF-8')
             if line[0] != '#':
                 # structure of line: [external ID, internal ID, SMILES]]
                 line = line.rstrip().split()
@@ -154,6 +156,7 @@
         if firstchembl:
             decoys = []
             for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'):
+                line=line.decode('UTF-8')
                 if line[0] != '#':
                     # structure of line: [external ID, internal ID, SMILES]]
                     line = line.rstrip().split()
@@ -164,20 +167,20 @@
             np_fps_dcy = ml_func.getNumpy(decoys)
             firstchembl = False
         num_decoys = len(decoys)
-        print "molecules read in and fingerprints calculated"
+        print("molecules read in and fingerprints calculated")
 
         # open training and test lists
-        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r')
-        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r')
+        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb')
+        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb')
         # to store the scored lists
         scores = defaultdict(list)
         # loop over repetitions
        for q in actives.keys():
-            print q
+            print(q)
             num_actives = len(actives[q])
             np_fps_act = ml_func.getNumpy(actives[q])
-            training_list = cPickle.load(training_input)
-            test_list = cPickle.load(test_input)
+            training_list = pickle.load(training_input)
+            test_list = pickle.load(test_input)
             test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]]
 
             # list with active/inactive info
@@ -206,6 +209,6 @@
         else:
             outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
         for fp in ['nb_'+fp_build]:
-            cPickle.dump([fp, scores[fp]], outfile, 2)
+            pickle.dump([fp, scores[fp]], outfile, 2)
         outfile.close()
-        print "scoring done and scored lists written"
+        print("scoring done and scored lists written")
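The same Python 3 port pattern recurs in each of these commits: pickle replaces
cPickle and needs binary file modes ('rb'/'wb'), and gzip.open() yields bytes,
hence the explicit line.decode('UTF-8') before the '#' comparison. An
equivalent alternative (a sketch, not what the patch does, shown for a
hypothetical target id 100) is to open the archive in text mode instead:

    import gzip

    # 'rt' makes gzip.open decode each line to str, so no decode() is needed
    with gzip.open('cmp_list_ChEMBL_100_actives.dat.gz', 'rt') as inf:
        for line in inf:
            if line[0] != '#':
                # structure of line: [external ID, internal ID, SMILES]
                ext_id, int_id, smiles = line.rstrip().split()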
From 59fd09ba1049ff137ba81f3b4e77b4b85ed30072 Mon Sep 17 00:00:00 2001
From: greg landrum
Date: Mon, 19 Dec 2022 04:31:55 +0100
Subject: [PATCH 9/9] basics

---
 .../data_sets_II/calculate_scored_lists.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/scoring/data_sets_II/calculate_scored_lists.py b/scoring/data_sets_II/calculate_scored_lists.py
index 06a0deb..bb196ed 100644
--- a/scoring/data_sets_II/calculate_scored_lists.py
+++ b/scoring/data_sets_II/calculate_scored_lists.py
@@ -48,7 +48,7 @@
 #
 
 from rdkit import Chem, DataStructs
-import cPickle, gzip, sys, os, os.path
+import pickle, gzip, sys, os, os.path
 from collections import defaultdict
 from optparse import OptionParser
 
@@ -108,10 +108,10 @@
 
     # loop over targets
     for target in conf.set_data:
-        print target
+        print(target)
 
         # read in training actives and calculate fps
-        actives = cPickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'r'))
+        actives = pickle.load(open(inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl', 'rb'))
         for k in actives.keys():
             for i,m in enumerate(actives[k]):
                 fp_dict = scor.getFPDict(fp_names, m[1])
@@ -120,6 +120,7 @@
         # read in test actives and calculate fps
         div_actives = []
         for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz', 'r'):
+            line=line.decode('UTF-8')
             if line[0] != '#':
                 # structure of line: [external ID, internal ID, SMILES]]
                 line = line.rstrip().split()
@@ -132,6 +133,7 @@
         if firstchembl:
             decoys = []
             for line in gzip.open(inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz', 'r'):
+                line=line.decode('UTF-8')
                 if line[0] != '#':
                     # structure of line: [external ID, internal ID, SMILES]]
                     line = line.rstrip().split()
@@ -140,19 +142,19 @@
                     decoys.append([line[1], fp_dict])
             firstchembl = False
         num_decoys = len(decoys)
-        print "molecules read in and fingerprints calculated"
+        print("molecules read in and fingerprints calculated")
 
         # open training and test lists
-        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'r')
-        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'r')
+        training_input = open(inpath_list+'/training_'+str(target)+'.pkl', 'rb')
+        test_input = open(inpath_list+'/test_'+str(target)+'.pkl', 'rb')
         # to store the scored lists
         scores = defaultdict(list)
 
         # loop over papers
         for q in actives.keys():
             num_actives = len(actives[q])
-            training_list = cPickle.load(training_input)
-            test_list = cPickle.load(test_input)
+            training_list = pickle.load(training_input)
+            test_list = pickle.load(test_input)
             test_list += [i for i in range(num_decoys) if i not in training_list[num_actives:]]
             # loop over fps
             single_score = defaultdict(list)
@@ -175,6 +177,6 @@
         else:
             outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
         for fp in fp_names:
-            cPickle.dump([fp, scores[fp]], outfile, 2)
+            pickle.dump([fp, scores[fp]], outfile, 2)
         outfile.close()
-        print "scoring done and scored lists written"
+        print("scoring done and scored lists written")
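Since all three scorers may append several pickles to one gzip archive (mode
'ab+' when -a is given), reading a scored list back means calling
pickle.load() until the stream is exhausted. A minimal sketch, with a
hypothetical output file name:

    import gzip, pickle

    results = {}
    with gzip.open('list_100.pkl.gz', 'rb') as inf:
        while True:
            try:
                # each dump is [fingerprint name, list of scored lists]
                fp_name, score_lists = pickle.load(inf)
            except EOFError:
                break
            results[fp_name] = score_lists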