import os
import csv
import collections
We use the `pubchempy` package to query PubChem's API from within Python. See the pubchempy documentation for more information on the package.
import pubchempy
def get_pubchem_parent(cid, orphans_as_self=True):
    """
    From a PubChem CID, retrieve the parent compound's CID.

    According to PubChem:

    > A parent is conceptually the "important" part of the molecule
    > when the molecule has more than one covalent component.
    > Specifically, a parent component must have at least one carbon
    > and contain at least 70% of the heavy (non-hydrogen) atoms of
    > all the unique covalent units (ignoring stoichiometry).
    > Note that this is a very empirical definition and is subject to change.

    A parallel query can be executed using the REST PUG API:
    http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/11477084/cids/XML?cids_type=parent

    Parameters:
        cid: PubChem compound identifier to look up. Must be truthy.
        orphans_as_self: when the lookup does not yield exactly one
            parent, return `cid` itself if True (default), otherwise
            return None.

    Returns:
        The single parent CID reported by pubchempy, or the fallback
        described under `orphans_as_self` when the request is rejected
        or when zero or several parents come back.

    Raises:
        AssertionError: if `cid` is falsy.
    """
    assert cid, 'cid must be a non-empty PubChem compound identifier'

    # Value handed back whenever a unique parent cannot be determined.
    fallback = cid if orphans_as_self else None

    try:
        parent_cids = pubchempy.get_cids(
            identifier=cid,
            namespace='cid',
            domain='compound',
            cids_type='parent',
        )
    except pubchempy.BadRequestError as e:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Error getting parent of {}. {}'.format(cid, e))
        return fallback

    try:
        # One-element unpacking raises ValueError unless exactly one
        # parent was returned.
        parent_cid, = parent_cids
    except ValueError:
        print('Error getting parent of {}. Parents retreived: {}'.format(cid, parent_cids))
        return fallback
    return parent_cid
# Load the SIDER compound list: a single-column file whose only field is
# read in as 'pubchem_cid' (the file itself carries no header row).
path = os.path.join('..', 'data', 'sider_compounds_pubchem.txt')
with open(path) as read_file:
    reader = csv.DictReader(
        read_file,
        fieldnames=['pubchem_cid'],
    )
    # Materialize while the file is still open; each row is a dict that
    # later steps extend in place.
    rows = [record for record in reader]
# Notebook preview of the first few rows.
rows[:3]
# Extend every row with its parent CID and the canonical SMILES of both the
# compound and its parent.
for row in rows:
    cid = row['pubchem_cid']
    parent_cid = get_pubchem_parent(cid)
    # One request for both identifiers; the result is unpacked as exactly two
    # property records, taken to be in request order (compound, then parent).
    property_records = pubchempy.get_properties(
        properties=['canonical_smiles'],
        identifier=[cid, parent_cid],
        namespace='cid',
    )
    cid_props, cid_parent_props = property_records
    row['canonical_smiles'] = cid_props['CanonicalSMILES']
    row['pubchem_cid_parent'] = parent_cid
    row['canonical_smiles_parent'] = cid_parent_props['CanonicalSMILES']
# Notebook preview of the first few annotated rows.
rows[:3]
# Tally how many compounds are their own parent (True) versus not (False);
# both sides are stringified so that differing CID types still compare equal.
collections.Counter(
    str(entry['pubchem_cid']) == str(entry['pubchem_cid_parent'])
    for entry in rows
)
# Save the annotated compounds as a tab-delimited table with a header row.
path = os.path.join('..', 'data', 'compounds.txt')
fieldnames = [
    'pubchem_cid',
    'pubchem_cid_parent',
    'canonical_smiles',
    'canonical_smiles_parent',
]
with open(path, 'w') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)
For constructing compound networks, `compounds.txt` can be used as a node attributes table and `similarities.txt` can be used like a `.sif` file for edges. To exclude similarities for compound pairs where fewer than all three methods produce a score, use `similarities-complete.txt`.