Source code for pyqmmm.md.hbond_compare

"""Analyze data from hydrogen bonding analysis based on hbond.gnu file."""

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import subprocess
import sys


[docs]def bond_labels(file_path, ignore_backbone=True, include_backbone=tuple("DHK")):
    """
    Extracts bond labels from gnu file.

    Parameters
    ----------
    file_path: str
        Path to hbond.gnu file
    ignore_backbone: bool
        Whether to ignore backbone hydrogen bonds (donor/acceptor named N or O)
    include_backbone: tuple(str)
        If ignore_backbone, these residues are the exception

    Returns
    -------
    grouped: pd.DataFrame
        DataFrame containing residue pair and interaction index
    """

    dict = {}
    with open(file_path, "r") as f:
        for line in f:
            if line[:10] == "set ytics(":
                bonds = line.split("(")[1].split(")")[0]
                for b in bonds.split(","):
                    key_val = b.split(" ")
                    dict[int(float(key_val[-1]))] = key_val[0].strip('"')
    labels = pd.Series(dict.values(), index=dict.keys(), name="labels")
    labels = labels.str.split("-", expand=True)
    labels.columns = ["acceptor", "donor", "hydrogen"]
    labels[["acceptor", "acceptor_atom"]] = labels["acceptor"].str.split(
        "@", expand=True
    )
    labels[["donor", "donor_atom"]] = labels["donor"].str.split("@", expand=True)
    # Ignore backbone hydrogen atom donors/acceptors if the acceptor or donor is not in include_backbone
    if ignore_backbone:
        include_regex = "|".join(include_backbone)
        labels = labels[
            ~(
                (
                    labels["acceptor_atom"].isin(["N", "O"])
                    & ~labels["acceptor"].str.contains(include_regex, regex=True)
                )
                | (
                    labels["donor_atom"].isin(["N", "O"])
                    & ~labels["donor"].str.contains(include_regex, regex=True)
                )
            )
        ]
    labels = labels.reset_index()
    grouped = labels.groupby(["acceptor", "donor"])["index"].apply(set).reset_index()

    return grouped


[docs]def count_occurrences(file_path, labels):
    """
    Counts percent occurrences of each hydrogen bond.

    Parameters
    ----------
    file_path: str
        Path to hbond.gnu file
    labels: str
        Hbond labels as outputted by CPPTRAJ in gnu format

    Returns
    -------
    labels: DataFrame containing residue pair and interaction index
    frame_count: total number of frames in trajectory
    """

    labels["count"] = 0
    frame_count = 0
    with open(file_path, "r") as f:
        for _ in range(8):
            next(f)
        for frame in f.read().split("\n\n"):
            frame_count += 1
            bonds = set()
            for line in frame.split("\n"):
                if line == "end":
                    break
                arr = [int(float(x)) for x in line.split(" ") if x]
                if arr[-1] == 1:
                    bonds.add(arr[1])
            contains_bond = [len(i & bonds) != 0 for i in labels["index"]]
            labels.loc[contains_bond, "count"] += 1
    return labels, frame_count


[docs]def process_data(count_df, frame_count, name, substrate):
    """
    Cleans up hbonding data (formats residue names, dataframe index, etc.)

    Parameters
    ----------
    count_df: pd.DataFrame
        DataFrame containing the hydrogen bonding counts
    frame_count: int
        Number of frames in trajectory
    name: str
        System name

    Returns
    -------
    count_df: pd.DataFrame
        Cleaned DataFrame containing the hydrogen bonding counts


    """
    increment = 0  # Is there an offset between the res number and the real res number
    res_ser = pd.concat([count_df["acceptor"], count_df["donor"]], axis=0)
    count_df["residue"] = res_ser[~res_ser.str.contains(substrate)].sort_index()
    count_df["percent_occurrence"] = count_df["count"] / frame_count
    count_df[["residue", "position"]] = count_df["residue"].str.split("_", expand=True)
    count_df["position"] = count_df["position"].astype(int) + increment
    format_residues_dict = {
        "ALA": "A",
        "ARG": "R",
        "ASN": "N",
        "AP1": "D",
        "ASP": "D",
        "CYS": "C",
        "FE1": "FE",
        "GLU": "E",
        "GLN": "Q",
        "GLY": "G",
        "HID": "H",
        "HIS": "H",
        "HIP": "H",
        "HIE": "H",
        "HD1": "H",
        "HD2": "H",
        "ILE": "I",
        "LEU": "L",
        "LYS": "K",
        "MET": "M",
        "OO1": "OXO",
        "PHE": "F",
        "PRO": "P",
        "SER": "S",
        "SC1": "SUC",
        "THR": "T",
        "TRP": "W",
        "TYR": "Y",
        "VAL": "V",
        "_": "",
    }
    for old, new in format_residues_dict.items():
        count_df["residue"] = count_df["residue"].str.replace(old, new, regex=False)
    count_df["residue"] = count_df["residue"] + count_df["position"].astype(str)
    count_df = count_df.rename(columns={"percent_occurrence": name})[
        ["residue", name, "position"]
    ]
    count_df = count_df.set_index("residue")
    count_df[name] = count_df[name] * 100
    return count_df


[docs]def figure_formatting():
    """
    Sets formatting for matplotlib.

    """
    font = {"family": "sans-serif", "weight": "bold", "size": 14}
    plt.rc("font", **font)
    plt.rcParams["svg.fonttype"] = "none"
    plt.rcParams["axes.linewidth"] = 2.5
    plt.rcParams["xtick.major.size"] = 10
    plt.rcParams["xtick.major.width"] = 2.5
    plt.rcParams["ytick.major.size"] = 10
    plt.rcParams["ytick.major.width"] = 2.5
    plt.rcParams["xtick.direction"] = "in"
    plt.rcParams["ytick.direction"] = "in"
    plt.rcParams["mathtext.default"] = "regular"


[docs]def plot(data, file_path):
    """
    Plot single figure for hbonding analysis.

    Parameters
    ----------
    data: pd.DataFrame
        Data that will be used to plot the figure
    file_path: str
        Path to directory where output image should go

    """
    figure_formatting()
    df = data.sort_values("position").drop(["position"], axis=1)
    df = df[df.ge(0.1).all(axis=1)]  # 10% occurence cutoff
    ax = df.plot.bar(color=["Black"])
    ax.set_ylabel("occurrence (%)", weight="bold")
    ax.set_xlabel("residue", weight="bold")
    ax.legend().set_visible(False)
    ax.tick_params(axis="x", labelrotation=90)
    plt.savefig(
        file_path + "hbond.png", bbox_inches="tight", transparent=False, dpi=600
    )


[docs]def plot_multi(data, file_path):
    """
    Plot hbonding comparison between two trajectories.

    Parameters
    ----------
    data: list of two dataframes
    file_path: path to directory where output image should go

    """
    figure_formatting()
    new_df = (
        pd.merge(data[0], data[1], on=["residue", "position"], how="inner")
        .sort_index()
        .sort_values("position")
    )
    new_df = new_df.dropna(axis=0).drop(["position"], axis=1)
    new_df = new_df[new_df.ge(0.1).all(axis=1)]  # 10% ocurrence cutoff
    ax = new_df.plot.bar(color=["Blue", "Red", "Orange"])
    ax.set_ylabel("occurrence (%)", weight="bold")
    ax.set_xlabel("residue", weight="bold")
    ax.legend(bbox_to_anchor=(1.34, 1.02), frameon=False)
    ax.tick_params(axis="x", labelrotation=90)
    plt.savefig(
        file_path + "hbond_comparison.png",
        bbox_inches="tight",
        transparent=False,
        dpi=600,
    )


[docs]def analyze_hbonds(file_paths, names, substrate):
    """
    Driver for analyzing hbonds from hbond.gnu file

    Parameters
    ----------
    file_paths: list[str]
        A list of paths to hbond.gnu files
    names: list[str]
        A list of names of each hbond.gnu files
    """
    data = []
    for file_path, name in zip(file_paths, names):
        data_path = Path(file_path + "hbond.csv")
        if data_path.is_file():
            print(f"{data_path} already exists")
            d = pd.read_csv(file_path + "hbond.csv")
            d = d.set_index("residue")
        else:
            path = file_path + "hbond.gnu"
            print(f"Processing: {path}")
            label_df = bond_labels(path)
            count_df, frame_count = count_occurrences(path, label_df)
            d = process_data(count_df, frame_count, name, substrate)
            d.to_csv(file_path + "hbond.csv")
        plot(d, file_path)
        data.append(d)

    plot_multi(data, file_paths[0])


if __name__ == "__main__":
    """
    First argument contains [data_file1, data_file2] -- two directories containing hbond.gnu files
    --> this will also be the output directories
    Second argument contains [name1, name2] -- names corresponding to the hbond.gnu files (e.g., H127A, Wild-type)
    """

    analyze_hbonds(
        [
            "/Users/kastner/Downloads/test/obtuse/",
            "/Users/kastner/Downloads/test/acute/",
        ],
        ["acute", "obtuse"],
        "DHK",
    )