DTI 1c: Processed Drug System Tables
Enriching intermediate drug system tables with Polars.
This installment of the drug-target interaction series focuses on building the processed drug system tables – Ligand_Molecule, Ligand_Atom, and Ligand_Bond – from their interim variants and the Ligand_Ring table. Those input tables were generated in a prior step by looping through the molecule supply SDF file.
Target schemas
Molecule
Atom
Bond
Sourcing
import polars as pl
# Load the inputs for this step. Each source is a directory of parquet
# files, hence the "*" glob in every path.
interim_molecule = pl.read_parquet(interim_drug_system_root_path / "mol/*")
ring = pl.read_parquet(interim_drug_system_root_path / "ring/*")
interim_atom = pl.read_parquet(interim_drug_system_root_path / "atom/*")
interim_bond = pl.read_parquet(interim_drug_system_root_path / "bond/*")
Molecule
The interim Molecule table lacks the `mean_atomic_weight` and `bonds_per_atom` features.
mean_atomic_weight
# Note: `weight` in the _Atom_ table is the average atomic weight of the given element.
# It _doesn't_ take isotopes into account.
# `weight` in the _Molecule_ table _does_ take isotopes into account.
# One row per molecule: the mean of its atoms' `weight` values.
mean_atom_weight_lookup = (
    interim_atom.group_by("molecule_id")
    .agg(pl.mean("weight").alias("mean_atomic_weight"))
    .sort("molecule_id")
)
bonds_per_atom
# One row per molecule: the mean number of bonds touching each of its atoms.
bonds_per_atom_lookup = (
    # Melt turns every bond row into two rows, one per endpoint atom ...
    interim_bond.melt(
        id_vars=["molecule_id"], value_vars=["atom1_index", "atom2_index"], value_name="atom_index"
    )
    # ... so the row count per (molecule, atom) is that atom's bond count.
    .group_by(["molecule_id", "atom_index"])
    .len()
    # Average the per-atom bond counts within each molecule.
    .group_by("molecule_id")
    .agg(pl.mean("len").alias("bonds_per_atom"))
    .sort("molecule_id")
)
Join
# Attach both derived features to the interim molecule table. The lookups are
# keyed by `molecule_id`, while the molecule table's key column is `id`, so
# each lookup is renamed before the left join.
molecule = interim_molecule.join(
    mean_atom_weight_lookup.rename({"molecule_id": "id"}), on="id", how="left"
).join(bonds_per_atom_lookup.rename({"molecule_id": "id"}), on="id", how="left")
molecule
shape: (1_277_006, 9)
id | supplier_index | name | smiles | n_atoms | n_bonds | weight | mean_atomic_weight | bonds_per_atom |
---|---|---|---|---|---|---|---|---|
i64 | i64 | str | str | i64 | i64 | f64 | f64 | f64 |
608734 | 0 | "6-[(4R,5S,6S,7R)-4,7-dibenzyl-… | "O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)… | 40 | 42 | 554.299202 | 12.8087 | 2.1 |
22 | 1 | "(4R,5S,6S,7R)-4,7-dibenzyl-5,6… | "O=C1N(C/C=C/c2cn[nH]c2)[C@H](C… | 40 | 44 | 538.269239 | 12.6095 | 2.2 |
23 | 2 | "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… | "O=C1N(C/C=C/c2cn[nH]c2)[C@H](C… | 36 | 40 | 486.263091 | 12.565111 | 2.222222 |
24 | 3 | "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… | "O=C1N(CCCCCCO)[C@H](Cc2ccccc2)… | 35 | 38 | 480.298808 | 12.580829 | 2.171429 |
25 | 4 | "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… | "O=C1N(CCCCCO)[C@H](Cc2ccccc2)[… | 34 | 37 | 466.283158 | 12.597588 | 2.176471 |
… | … | … | … | … | … | … | … | … |
492310 | 999964 | "US10975068, Example 451" | "O=C(N[C@H]1CCC[C@@H]1O)c1nc(C(… | 41 | 46 | 599.198952 | 13.887463 | 2.243902 |
492311 | 999965 | "US10975068, Comparator Example… | "C[C@H]1CCCN1C(=O)c1nc(C(=O)NCC… | 36 | 39 | 510.24131 | 13.233111 | 2.166667 |
492313 | 999968 | "US10975068, Comparator Example… | "CC(C)S(=O)(=O)Nc1cc(C(F)(F)F)c… | 39 | 41 | 595.154624 | 14.523308 | 2.102564 |
492332 | 999988 | "7-Methoxy-3-methyl-1-(3- methy… | "COc1cc2ncc3c(c2cc1-c1c[nH]nc1C… | 30 | 34 | 400.164774 | 12.676067 | 4.533333 |
492336 | 999992 | "7-Methoxy-3-methyl-1,8-bis- (1… | "COc1cc2ncc3c(c2cc1-c1cn(C)nc1C… | 30 | 34 | 403.175673 | 12.7426 | 4.533333 |
Write
# Persist the processed molecule table.
molecule.write_parquet(data_path / "processed/molecule.parquet")
Atom
The Atom table lacks the `ring_size_[]_count` fields.
ring_size_[]_count
# Inclusive (min, max) ring sizes that each get their own count column.
ring_size_range = (3, 8)
# Column names "ring_size_3_count" ... "ring_size_8_count", in ascending size order.
ring_size_count_columns = [
    f"ring_size_{i}_count" for i in range(ring_size_range[0], ring_size_range[1] + 1)
]
# One row per (molecule, atom) that belongs to at least one ring, with one
# column per ring size holding the number of rings of that size containing
# the atom.
ring_size_count_lookup = (
    # One row per (ring, member atom).
    ring.explode("atom_indices")
    .rename({"atom_indices": "atom_index"})
    # Count rings of each size per atom.
    .group_by(["molecule_id", "atom_index", "size"])
    .len()
    # Keep only ring sizes inside the configured inclusive range.
    .filter((pl.col("size") >= ring_size_range[0]) & (pl.col("size") <= ring_size_range[1]))
    # Spread sizes into columns; the pivoted columns are named "3", "4", ...
    .pivot(index=["molecule_id", "atom_index"], columns="size", values="len")
    # insert ring sizes in specified range _if_ not already present
    .pipe(lambda df: df.select(
        pl.all(),
        *[
            pl.lit(None).alias(str(i))
            for i in range(ring_size_range[0], ring_size_range[1] + 1)
            if str(i) not in df.columns
        ],
    ))
    .rename(
        {str(i): f"ring_size_{i}_count" for i in range(ring_size_range[0], ring_size_range[1] + 1)}
    )
    # A missing (atom, size) combination means zero rings of that size.
    .fill_null(0)
    # Rename the key to `index` and fix the column order for the join below.
    .select("molecule_id", pl.col("atom_index").alias("index"), *ring_size_count_columns)
    .sort("molecule_id", "index")
)
Join
# Left-join the ring counts onto the interim atom table. The select places the
# ring-size columns directly after the first four interim columns. Atoms with
# no row in the lookup keep nulls in the ring-size columns.
atom = interim_atom.join(ring_size_count_lookup, on=["molecule_id", "index"], how="left").select(
    *[pl.col(col) for col in interim_atom.columns[:4]],
    *ring_size_count_columns,
    *[pl.col(col) for col in interim_atom.columns[4:]]
)
atom
shape: (41_723_806, 18)
molecule_id | index | symbol | weight | ring_size_3_count | ring_size_4_count | ring_size_5_count | ring_size_6_count | ring_size_7_count | ring_size_8_count | chirality | hybridization | acceptor | donor | aromatic | x | y | z |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i64 | i64 | str | f64 | i64 | i64 | i64 | i64 | i64 | i64 | str | str | bool | bool | bool | f64 | f64 | f64 |
608734 | 0 | "O" | 15.999 | null | null | null | null | null | null | null | "SP2" | true | false | false | -1.011 | 3.174 | -6.577 |
608734 | 1 | "C" | 12.011 | null | null | null | null | null | null | null | "SP2" | false | false | false | -2.049 | 3.469 | -6.009 |
608734 | 2 | "O" | 15.999 | null | null | null | null | null | null | null | "SP2" | false | true | false | -2.863 | 4.337 | -6.577 |
608734 | 3 | "C" | 12.011 | null | null | null | null | null | null | null | "SP3" | false | false | false | -2.393 | 2.867 | -4.752 |
608734 | 4 | "C" | 12.011 | null | null | null | null | null | null | null | "SP3" | false | false | false | -2.502 | 3.929 | -3.645 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
492336 | 25 | "N" | 14.007 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | true | false | true | 1.47 | -5.32 | 0.007 |
492336 | 26 | "N" | 14.007 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | true | false | true | 1.579 | -4.954 | -1.234 |
492336 | 27 | "C" | 12.011 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | false | false | true | 0.386 | -4.576 | -1.722 |
492336 | 28 | "C" | 12.011 | 0 | 0 | 2 | 0 | 0 | 0 | null | "SP2" | false | false | true | -0.54 | -4.712 | -0.701 |
492336 | 29 | "C" | 12.011 | null | null | null | null | null | null | null | "SP3" | false | false | false | 2.805 | -4.955 | -1.945 |
Write
# Persist the processed atom table.
atom.write_parquet(data_path / "processed/atom.parquet")
Bond
The Bond table lacks the `same_ring` boolean feature.
same_ring
In PySpark, this problem could be solved with a conditional join:
# PySpark sketch shown for comparison only; the processed Bond table is built
# by the Polars steps that follow, not by this expression.
# NOTE(review): `psf` is presumably `pyspark.sql.functions`. Runnable code would
# likely need `psf.array_contains` for the membership tests against the array
# column and a disambiguated `molecule_id` on the ring side — confirm before reuse.
same_ring_bonds = (
    interim_bond
    .select("molecule_id", "index", "atom1_index", "atom2_index")
    .join(
        ring.withColumnsRenamed({"atom_indices": "ring_atom_indices"}).select("ring_atom_indices"),
        on=[
            psf.col("molecule_id") == psf.col("molecule_id"),
            psf.col("atom1_index").isin(psf.col("ring_atom_indices")),
            psf.col("atom2_index").isin(psf.col("ring_atom_indices")),
        ],
        how="inner"
    )
    .select("molecule_id", "index")
)
Polars doesn’t allow complex join logic like this, so the operation must be split up into multiple parts.
# For every bond, the rings that contain its first atom: one row per
# (molecule_id, bond index, ring_index).
bond_atom1_rings = interim_bond.select("molecule_id", "index", "atom1_index").join(
    ring
    .select("molecule_id", "index", "atom_indices")
    # One row per (ring, member atom).
    .explode("atom_indices")
    # Rename so the atom column matches the bond side, and so the ring's
    # `index` does not collide with the bond's `index`.
    .rename({"atom_indices": "atom1_index", "index": "ring_index"})
    .select("molecule_id", "ring_index", "atom1_index"),
    on=["molecule_id", "atom1_index"],
    how="inner",
).unique()
# For every bond, the rings that contain its second atom: one row per
# (molecule_id, bond index, ring_index).
bond_atom2_rings = interim_bond.select("molecule_id", "index", "atom2_index").join(
    ring
    .select("molecule_id", "index", "atom_indices")
    # One row per (ring, member atom).
    .explode("atom_indices")
    # Rename so the atom column matches the bond side, and so the ring's
    # `index` does not collide with the bond's `index`.
    .rename({"atom_indices": "atom2_index", "index": "ring_index"})
    .select("molecule_id", "ring_index", "atom2_index"),
    on=["molecule_id", "atom2_index"],
    how="inner",
).unique()
# A bond has both atoms in the same ring when some ring_index appears for both
# of its endpoints. The inner join on ring_index keeps exactly those bonds, and
# `unique` collapses bonds that share more than one ring into a single row.
same_ring_lookup = bond_atom1_rings.join(
    bond_atom2_rings, on=["molecule_id", "index", "ring_index"], how="inner"
).select("molecule_id", "index", pl.lit(True).alias("same_ring")).unique()
# Attach the flag to every interim bond. Bonds absent from the lookup get a
# null from the left join, which is then filled with False.
bond = (
    interim_bond
    .join(
        same_ring_lookup,
        on=["molecule_id", "index"],
        how="left"
    )
    .with_columns(
        pl.col("same_ring").fill_null(False)
    )
)
bond
shape: (45_597_555, 7)
molecule_id | index | atom1_index | atom2_index | type | stereochemistry | same_ring |
---|---|---|---|---|---|---|
i64 | i64 | i64 | i64 | str | str | bool |
608734 | 0 | 0 | 1 | "double" | null | false |
608734 | 1 | 1 | 2 | "single" | null | false |
608734 | 2 | 1 | 3 | "single" | null | false |
608734 | 3 | 3 | 4 | "single" | null | false |
608734 | 4 | 4 | 5 | "single" | null | false |
… | … | … | … | … | … | … |
492336 | 29 | 24 | 28 | "aromatic" | null | true |
492336 | 30 | 25 | 26 | "aromatic" | null | true |
492336 | 31 | 26 | 27 | "aromatic" | null | true |
492336 | 32 | 26 | 29 | "single" | null | false |
492336 | 33 | 27 | 28 | "aromatic" | null | true |
Write
# Persist the processed bond table.
bond.write_parquet(data_path / "processed/bond.parquet")