Source code for pyqmmm.md.hbond_analyzer

"""Analyze data from hydrogen bonding analysis based on hbond.gnu file."""

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import subprocess
import sys
import os
import textwrap


[docs]def compute_hbonds(cpptraj_script, submit_script, script_name):
    """
    Calculates the hbonding data if it does not exist.

    Checks if hbonding data has been generated in the current folder.
    If it hasn't, the script submits a a CPPTraj job and then exits.
    Once the CPPTraj job finishes and the job is run again,
    it will check that it finished currently and continue to the analysis.

    Parameters
    ----------
    protein_name: str
        The name of your protein used in the prmtop file name (e.g., TAUD)
    substrate_index: int
        The index of the substrate (e.g., 280)
    residue_range: str
        The range of amino acids for this protein (e.g., 1-243)

    See Also
    --------
    caddkit.md.hbonding_analyzer.analyze_hbonds()

    """
    # Check if hbond.gnu exists in the current directory
    if os.path.exists("hbond.gnu"):
        print(" > Hbonding results found")
    else:
        print(" > No hbond data found, submitting")

        # Write the CPPTRAJ script to a file
        with open(script_name, "w") as f:
            f.write(cpptraj_script)

        # Write the submit script to a file
        with open("submit.sh", "w") as f:
            f.write(submit_script)

        # Submit the job using the submit script
        subprocess.run(["qsub", "submit.sh"])

        print(" > A hbonding job has been submitted")


[docs]def bond_labels(file_path, ignore_backbone=True, include_backbone=tuple("DHK")):
    """
    Extracts bond labels from gnu file.

    Parameters
    ----------
    file_path: str
        Path to hbond.gnu file
    ignore_backbone: bool
        Whether to ignore backbone hydrogen bonds (donor/acceptor named N or O)
    include_backbone: tuple(str)
        If ignore_backbone, these residues are the exception

    Returns
    -------
    grouped: pd.DataFrame
        DataFrame containing residue pair and interaction index
    """

    dict = {}
    with open(file_path, "r") as f:
        for line in f:
            if line[:10] == "set ytics(":
                bonds = line.split("(")[1].split(")")[0]
                for b in bonds.split(","):
                    key_val = b.split(" ")
                    dict[int(float(key_val[-1]))] = key_val[0].strip('"')
    labels = pd.Series(dict.values(), index=dict.keys(), name="labels")
    labels = labels.str.split("-", expand=True)
    labels.columns = ["acceptor", "donor", "hydrogen"]
    labels[["acceptor", "acceptor_atom"]] = labels["acceptor"].str.split(
        "@", expand=True
    )
    labels[["donor", "donor_atom"]] = labels["donor"].str.split("@", expand=True)
    # Ignore backbone hydrogen atom donors/acceptors if the acceptor or donor is not in include_backbone
    if ignore_backbone:
        include_regex = "|".join(include_backbone)
        labels = labels[
            ~(
                (
                    labels["acceptor_atom"].isin(["N", "O"])
                    & ~labels["acceptor"].str.contains(include_regex, regex=True)
                )
                | (
                    labels["donor_atom"].isin(["N", "O"])
                    & ~labels["donor"].str.contains(include_regex, regex=True)
                )
            )
        ]
    labels = labels.reset_index()
    grouped = labels.groupby(["acceptor", "donor"])["index"].apply(set).reset_index()

    return grouped


[docs]def count_occurrences(file_path, labels):
    """
    Counts percent occurrences of each hydrogen bond.

    Parameters
    ----------
    file_path: str
        Path to hbond.gnu file
    labels: str
        Hbond labels as outputted by CPPTRAJ in gnu format

    Returns
    -------
    labels: DataFrame containing residue pair and interaction index
    frame_count: total number of frames in trajectory
    """

    labels["count"] = 0
    frame_count = 0
    with open(file_path, "r") as f:
        for _ in range(8):
            next(f)
        for frame in f.read().split("\n\n"):
            frame_count += 1
            bonds = set()
            for line in frame.split("\n"):
                if line == "end":
                    break
                arr = [int(float(x)) for x in line.split(" ") if x]
                if arr[-1] == 1:
                    bonds.add(arr[1])
            contains_bond = [len(i & bonds) != 0 for i in labels["index"]]
            labels.loc[contains_bond, "count"] += 1
    return labels, frame_count


[docs]def process_data(count_df, frame_count, name, substrate):
    """
    Cleans up hbonding data (formats residue names, dataframe index, etc.)

    Parameters
    ----------
    count_df: pd.DataFrame
        DataFrame containing the hydrogen bonding counts
    frame_count: int
        Number of frames in trajectory
    name: str
        System name

    Returns
    -------
    count_df: pd.DataFrame
        Cleaned DataFrame containing the hydrogen bonding counts


    """
    increment = 0  # Is there an offset between the res number and the real res number
    res_ser = pd.concat([count_df["acceptor"], count_df["donor"]], axis=0)
    count_df["residue"] = res_ser[~res_ser.str.contains(substrate)].sort_index()
    count_df["percent_occurrence"] = count_df["count"] / frame_count
    count_df[["residue", "position"]] = count_df["residue"].str.split("_", expand=True)
    count_df["position"] = count_df["position"].astype(int) + increment
    format_residues_dict = {
        "ALA": "A",
        "ARG": "R",
        "ASN": "N",
        "AP1": "D",
        "ASP": "D",
        "CYS": "C",
        "FE1": "FE",
        "GLU": "E",
        "GLN": "Q",
        "GLY": "G",
        "HID": "H",
        "HIS": "H",
        "HIP": "H",
        "HIE": "H",
        "HD1": "H",
        "HD2": "H",
        "ILE": "I",
        "LEU": "L",
        "LYS": "K",
        "MET": "M",
        "OO1": "OXO",
        "PHE": "F",
        "PRO": "P",
        "SER": "S",
        "SC1": "SUC",
        "THR": "T",
        "TRP": "W",
        "TYR": "Y",
        "VAL": "V",
        "_": "",
    }
    for old, new in format_residues_dict.items():
        count_df["residue"] = count_df["residue"].str.replace(old, new, regex=False)
    count_df["residue"] = count_df["residue"] + count_df["position"].astype(str)
    count_df = count_df.rename(columns={"percent_occurrence": name})[
        ["residue", name, "position"]
    ]
    count_df = count_df.set_index("residue")
    count_df[name] = count_df[name] * 100
    return count_df


[docs]def figure_formatting():
    """
    Sets formatting for matplotlib.

    """
    font = {"family": "sans-serif", "weight": "bold", "size": 14}
    plt.rc("font", **font)
    plt.rcParams["svg.fonttype"] = "none"
    plt.rcParams["axes.linewidth"] = 2.5
    plt.rcParams["xtick.major.size"] = 10
    plt.rcParams["xtick.major.width"] = 2.5
    plt.rcParams["ytick.major.size"] = 10
    plt.rcParams["ytick.major.width"] = 2.5
    plt.rcParams["xtick.direction"] = "in"
    plt.rcParams["ytick.direction"] = "in"
    plt.rcParams["mathtext.default"] = "regular"


[docs]def plot(data, file_path):
    """
    Plot single figure for hbonding analysis.

    Parameters
    ----------
    data: pd.DataFrame
        Data that will be used to plot the figure
    file_path: str
        Path to directory where output image should go

    """
    figure_formatting()
    df = data.sort_values("position").drop(["position"], axis=1)
    df = df[df.ge(0.1).all(axis=1)]  # 10% occurence cutoff

    # Sort dataframe by occurrence in descending order and select the top 7 rows
    df = df.sort_values(by=[df.columns[0]], ascending=False).head(15)

    ax = df.plot.bar(
        color="darkgray", figsize=(4, 4)
    )  # Add edgecolor and facecolor attributes
    ax.set_ylabel("occurrence (%)", weight="bold")
    ax.set_xlabel("residue", weight="bold")
    ax.legend().set_visible(False)
    ax.tick_params(axis="x", labelrotation=90)

    extensions = ["png", "svg"]
    for ext in extensions:
        plt.savefig(file_path + f"hbond.{ext}", format=ext, dpi=600, bbox_inches="tight")


[docs]def plot_multi(data, file_path):
    """
    Plot hbonding comparison between two trajectories.

    Parameters
    ----------
    data: list of two dataframes
    file_path: path to directory where output image should go

    """
    figure_formatting()
    new_df = (
        pd.merge(data[0], data[1], on=["residue", "position"], how="inner")
        .sort_index()
        .sort_values("position")
    )
    new_df = new_df.dropna(axis=0).drop(["position"], axis=1)
    new_df = new_df[new_df.ge(0.1).all(axis=1)]  # 10% ocurrence cutoff
    ax = new_df.plot.bar(color=["Blue", "Red", "Orange"])
    ax.set_ylabel("occurrence (%)", weight="bold")
    ax.set_xlabel("residue", weight="bold")
    ax.legend(bbox_to_anchor=(1.34, 1.02), frameon=False)
    ax.tick_params(axis="x", labelrotation=90)

    extensions = ["png", "svg"]
    for ext in extensions:
        plt.savefig(
            file_path + f"hbond_comparison.{ext}",
            bbox_inches="tight",
            transparent=False,
            dpi=600,
            format=ext, 
        )


[docs]def analyze_hbonds(file_paths, names, substrate):
    """
    Driver for analyzing hbonds from hbond.gnu file

    Parameters
    ----------
    file_paths: list[str]
        A list of paths to hbond.gnu files
    names: list[str]
        A list of names of each hbond.gnu files
    """
    data = []
    for file_path, name in zip(file_paths, names):
        data_path = Path(file_path + "hbond.csv")
        if data_path.is_file():
            print(f"   > {data_path} already exists")
            d = pd.read_csv(file_path + "hbond.csv")
            d = d.set_index("residue")
        else:
            path = file_path + "hbond.gnu"
            print(f"   > Processing: {path}")
            label_df = bond_labels(path)
            count_df, frame_count = count_occurrences(path, label_df)
            d = process_data(count_df, frame_count, name, substrate)
            d.to_csv(file_path + "hbond.csv")
        plot(d, file_path)
        print(f"   > Creating single plot")
        data.append(d)

    if len(names) > 1:
        plot_multi(data, file_paths[0])
        print(f"   > Creating multi plot")


if __name__ == "__main__":
    """
    First argument contains [data_file1, data_file2] -- two directories containing hbond.gnu files
    --> this will also be the output directories
    Second argument contains [name1, name2] -- names corresponding to the hbond.gnu files (e.g., H127A, Wild-type)
    """

    analyze_hbonds(
        [
            "/Users/kastner/Downloads/test/obtuse/",
            "/Users/kastner/Downloads/test/acute/",
        ],
        ["acute", "obtuse"],
        "DHK",
    )