DTI 1c: Processed Drug System Tables

Enriching intermediate drug system tables with Polars.

Published

May 17, 2024

This release of the drug-target interaction series will focus on building the processed drug system tables – Ligand_Molecule, Ligand_Atom, and Ligand_Bond – from the interim variants and the Ligand_Ring table. These tables were generated in a prior step by looping through the molecule supply SDF file.

erDiagram
    Reaction
    Target

    Ligand_Molecule
    Ligand_Atom
    Ligand_Bond

    Reaction }o--|| Ligand_Molecule : "reacts"
    Reaction }o--|| Target : "reacts"
    Ligand_Molecule ||--|{ Ligand_Atom : "contains"
    Ligand_Bond }|--|{ Ligand_Atom : "bonds"

Target schemas

Molecule

erDiagram
    Molecule {
        int id PK
        varchar name
        varchar smiles
        int n_atoms
        int n_bonds
        float weight
        float mean_atomic_weight "Average of average (not exact) weights of each atom within the molecule"
        float bonds_per_atom "Average bound count of each atom within the molecule"
    }

Atom

erDiagram
    Atom {
        int molecule_id PK, FK
        int index PK
        categorical symbol
        categorical chirality
        int ring_size_[n]_count
        categorical hybridization
        bool acceptor
        bool donor
        bool aromatic
        float x
        float y
        float z
    }

Bond

erDiagram
    Bond {
        int molecule_id PK, FK
        int index PK
        int atom1_index FK
        int atom2_index FK
        categorical type
        bool same_ring
    }

Sourcing

import polars as pl

interim_molecule = pl.read_parquet(interim_drug_system_root_path / "mol/*")
ring = pl.read_parquet(interim_drug_system_root_path / "ring/*")
interim_atom = pl.read_parquet(interim_drug_system_root_path / "atom/*")
interim_bond = pl.read_parquet(interim_drug_system_root_path / "bond/*")

Molecule

The interim Molecule table lacks the mean_atomic_weight and bonds_per_atom features.

mean_atomic_weight

# Note: `weight` in the _Atom_ table is the average atomic weight of the given element.
# It _doesn't_ take isotopes into account.
# `weight` in the _Molecule_ table _does_ take isotopes into account.
mean_atom_weight_lookup = (
    interim_atom.group_by("molecule_id")
    .agg(pl.mean("weight").alias("mean_atomic_weight"))
    .sort("molecule_id")
)

bonds_per_atom

bonds_per_atom_lookup = (
    interim_bond.melt(
        id_vars=["molecule_id"], value_vars=["atom1_index", "atom2_index"], value_name="atom_index"
    )
    .group_by(["molecule_id", "atom_index"])
    .len()
    .group_by("molecule_id")
    .agg(pl.mean("len").alias("bonds_per_atom"))
    .sort("molecule_id")
)

Join

molecule = interim_molecule.join(
    mean_atom_weight_lookup.rename({"molecule_id": "id"}), on="id", how="left"
).join(bonds_per_atom_lookup.rename({"molecule_id": "id"}), on="id", how="left")
molecule
shape: (1_277_006, 9)
id supplier_index name smiles n_atoms n_bonds weight mean_atomic_weight bonds_per_atom
i64 i64 str str i64 i64 f64 f64 f64
608734 0 "6-[(4R,5S,6S,7R)-4,7-dibenzyl-… "O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)… 40 42 554.299202 12.8087 2.1
22 1 "(4R,5S,6S,7R)-4,7-dibenzyl-5,6… "O=C1N(C/C=C/c2cn[nH]c2)[C@H](C… 40 44 538.269239 12.6095 2.2
23 2 "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… "O=C1N(C/C=C/c2cn[nH]c2)[C@H](C… 36 40 486.263091 12.565111 2.222222
24 3 "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… "O=C1N(CCCCCCO)[C@H](Cc2ccccc2)… 35 38 480.298808 12.580829 2.171429
25 4 "(4R,5S,6S,7R)-4,7-dibenzyl-1-(… "O=C1N(CCCCCO)[C@H](Cc2ccccc2)[… 34 37 466.283158 12.597588 2.176471
492310 999964 "US10975068, Example 451" "O=C(N[C@H]1CCC[C@@H]1O)c1nc(C(… 41 46 599.198952 13.887463 2.243902
492311 999965 "US10975068, Comparator Example… "C[C@H]1CCCN1C(=O)c1nc(C(=O)NCC… 36 39 510.24131 13.233111 2.166667
492313 999968 "US10975068, Comparator Example… "CC(C)S(=O)(=O)Nc1cc(C(F)(F)F)c… 39 41 595.154624 14.523308 2.102564
492332 999988 "7-Methoxy-3-methyl-1-(3- methy… "COc1cc2ncc3c(c2cc1-c1c[nH]nc1C… 30 34 400.164774 12.676067 4.533333
492336 999992 "7-Methoxy-3-methyl-1,8-bis- (1… "COc1cc2ncc3c(c2cc1-c1cn(C)nc1C… 30 34 403.175673 12.7426 4.533333

Write

molecule.write_parquet(data_path / "processed/molecule.parquet")

Atom

The Atom table lacks the ring_size_[]_count fields.

ring_size_[]_count

ring_size_range = (3, 8)
ring_size_count_columns = [
    f"ring_size_{i}_count" for i in range(ring_size_range[0], ring_size_range[1] + 1)
]

ring_size_count_lookup = (
    ring.explode("atom_indices")
    .rename({"atom_indices": "atom_index"})
    .group_by(["molecule_id", "atom_index", "size"])
    .len()
    .filter((pl.col("size") >= ring_size_range[0]) & (pl.col("size") <= ring_size_range[1]))
    .pivot(index=["molecule_id", "atom_index"], columns="size", values="len")
    # insert ring sizes in specified range _if_ not already present
    .pipe(
        lambda df: df.select(
            pl.all(),
            *[
                pl.lit(None).alias(str(i))
                for i in range(ring_size_range[0], ring_size_range[1] + 1)
                if str(i) not in df
            ],
        )
    )
    .rename(
        {str(i): f"ring_size_{i}_count" for i in range(ring_size_range[0], ring_size_range[1] + 1)}
    )
    .fill_null(0)
    .select("molecule_id", pl.col("atom_index").alias("index"), *ring_size_count_columns)
    .sort("molecule_id", "index")
)

Join

atom = interim_atom.join(ring_size_count_lookup, on=["molecule_id", "index"], how="left").select(
    *[pl.col(col) for col in interim_atom.columns[:4]],
    *ring_size_count_columns,
    *[pl.col(col) for col in interim_atom.columns[4:]]
)
atom
shape: (41_723_806, 18)
molecule_id index symbol weight ring_size_3_count ring_size_4_count ring_size_5_count ring_size_6_count ring_size_7_count ring_size_8_count chirality hybridization acceptor donor aromatic x y z
i64 i64 str f64 i64 i64 i64 i64 i64 i64 str str bool bool bool f64 f64 f64
608734 0 "O" 15.999 null null null null null null null "SP2" true false false -1.011 3.174 -6.577
608734 1 "C" 12.011 null null null null null null null "SP2" false false false -2.049 3.469 -6.009
608734 2 "O" 15.999 null null null null null null null "SP2" false true false -2.863 4.337 -6.577
608734 3 "C" 12.011 null null null null null null null "SP3" false false false -2.393 2.867 -4.752
608734 4 "C" 12.011 null null null null null null null "SP3" false false false -2.502 3.929 -3.645
492336 25 "N" 14.007 0 0 2 0 0 0 null "SP2" true false true 1.47 -5.32 0.007
492336 26 "N" 14.007 0 0 2 0 0 0 null "SP2" true false true 1.579 -4.954 -1.234
492336 27 "C" 12.011 0 0 2 0 0 0 null "SP2" false false true 0.386 -4.576 -1.722
492336 28 "C" 12.011 0 0 2 0 0 0 null "SP2" false false true -0.54 -4.712 -0.701
492336 29 "C" 12.011 null null null null null null null "SP3" false false false 2.805 -4.955 -1.945

Write

atom.write_parquet(data_path / "processed/atom.parquet")

Bond

The Bond table lacks the same_ring boolean feature.

same_ring

In PySpark, this problem could be solved with a conditional join:

same_ring_bonds = (
    interim_bond
    .select("molecule_id", "index", "atom1_index", "atom2_index")
    .join(
        ring.WithColumnsRenamed({"atom_indices": "ring_atom_indices"}).select("ring_atom_indices"),
        on=[
            psf.col("molecule_id") == psf.col("molecule_id"), 
            psf.col("atom1_index").isin(psf.col("ring_atom_indices"), 
            psf.col("atom2_index").isin(psf.col("ring_atom_indices")),
        ],
        how="inner"
    )
    .select("molecule_id", "index")
)

Polars doesn’t allow complex join logic like this so the operation must be split up into multiple parts.

bond_atom1_rings = interim_bond.select("molecule_id", "index", "atom1_index").join(
    ring
    .select("molecule_id", "index", "atom_indices")
    .explode("atom_indices")
    .rename({"atom_indices": "atom1_index", "index": "ring_index"})
    .select("molecule_id", "ring_index", "atom1_index"),
    on=["molecule_id", "atom1_index"],
    how="inner",
).unique()

bond_atom2_rings = interim_bond.select("molecule_id", "index", "atom2_index").join(
    ring
    .select("molecule_id", "index", "atom_indices")
    .explode("atom_indices")
    .rename({"atom_indices": "atom2_index", "index": "ring_index"})
    .select("molecule_id", "ring_index", "atom2_index"),
    on=["molecule_id", "atom2_index"],
    how="inner",
).unique()

same_ring_lookup = bond_atom1_rings.join(
    bond_atom2_rings, on=["molecule_id", "index", "ring_index"], how="inner"
).select("molecule_id", "index", pl.lit(True).alias("same_ring")).unique()
bond = (
    interim_bond
    .join(
        same_ring_lookup,
        on=["molecule_id", "index"],
        how="left"
    )
    .with_columns(
        pl.col("same_ring").fill_null(False)
    )
)
bond
shape: (45_597_555, 7)
molecule_id index atom1_index atom2_index type stereochemistry same_ring
i64 i64 i64 i64 str str bool
608734 0 0 1 "double" null false
608734 1 1 2 "single" null false
608734 2 1 3 "single" null false
608734 3 3 4 "single" null false
608734 4 4 5 "single" null false
492336 29 24 28 "aromatic" null true
492336 30 25 26 "aromatic" null true
492336 31 26 27 "aromatic" null true
492336 32 26 29 "single" null false
492336 33 27 28 "aromatic" null true

Write

bond.write_parquet(data_path / "processed/bond.parquet")