Source code for pyqmmm.md.gbsa_analyzer

"""Process and analyze output from AMBER GBSA calculation"""

import glob
import pandas as pd
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype


[docs]def format_plot() -> None:
    """
    General plotting parameters for the Kulik Lab.
    """
    font = {"family": "sans-serif", "weight": "bold", "size": 10}
    plt.rc("font", **font)
    plt.rcParams["xtick.major.pad"] = 5
    plt.rcParams["ytick.major.pad"] = 5
    plt.rcParams["axes.linewidth"] = 2
    plt.rcParams["xtick.major.size"] = 7
    plt.rcParams["xtick.major.width"] = 2
    plt.rcParams["ytick.major.size"] = 7
    plt.rcParams["ytick.major.width"] = 2
    plt.rcParams["xtick.direction"] = "in"
    plt.rcParams["ytick.direction"] = "in"
    plt.rcParams["xtick.top"] = False
    plt.rcParams["ytick.right"] = False
    plt.rcParams["svg.fonttype"] = "none"


[docs]def get_gbsa_df(raw, ignore_residues) -> pd.DataFrame:
    """
    Turn the GBSA file into a parsable pd.DataFrame.

    Parameters
    ----------
    raw: str
        The name of the GBSA output file.

    Returns
    -------
    df: pd.DataFrame
        The raw GBSA file as a pd.DataFrame

    """

    total_energy_keyword = "D,E,L,T,A,S,:"
    sidechain_keyword = "S,i,d,e,c,h,a,i,n, ,E,n,e,r,g,y, ,D,e,c,o,m,p,o,s,i,t,i,o,n,:"
    columns = [
        "Resname 1",
        "Resid 1",
        "Resname 2",
        "Resid 2",
        "Internal",
        "Internal SD",
        "Internal SDM",
        "VDW",
        "VDW SD",
        "VDW SDM",
        "Electrostatic",
        "Electrostatic SD",
        "Electrostatic SDM",
        "Polar",
        "Polar SD",
        "Polar SDM",
        "Non-polar",
        "Non-polar SD",
        "Non-polar SDM",
        "Total",
        "Total SD",
        "Total SDM",
    ]
    csv_file_name = "deltas.csv"

    delta_section = False
    with open(raw, "r") as raw_data, open(csv_file_name, "w") as csv_file:
        for line in raw_data:
            if delta_section:
                if "T,o,t,a,l" in line or "Std" in line or "Resid" in line:
                    if "Resid" in line:
                        csv_file.write(",".join(columns) + "\n")
                    continue
                if sidechain_keyword in line:
                    break
                else:
                    line = ",".join(line.split()) + "\n"
                    csv_file.write(line)

            if line.startswith(total_energy_keyword):
                delta_section = True

    df = pd.read_csv(csv_file_name)
    df = df[~df["Resname 2"].isin(ignore_residues)]
    df = df[df["Resid 1"] != df["Resid 2"]]

    return df


[docs]def update_res_names(df) -> pd.DataFrame:
    """
    Updates odd residue names to more conventional names.

    Parameters
    ----------
    df: pd.DataFrame
        The raw GBSA file read in as a DataFrame.

    Returns
    -------
    df: pd.DataFrame
        The GBSA file as a DataFrame with more intuitive residue names.

    """

    res_names = {
        "AG2": "ARG",
        "AN1": "ASN",
        "HIE": "HIS",
        "TR1": "TYR",
        "HD1": "HIS",
        "HD2": "HIS",
        "AP1": "ASP",
        "CL1": "CL",
        "SC1": "SUC",
    }

    df.reset_index(inplace=True)
    df.replace({"Resname 1": res_names, "Resname 2": res_names}, inplace=True)
    df.insert(4, "Residue", df["Resname 2"] + df["Resid 2"].astype(str))

    return df


[docs]def get_top_hits_df(df, sub_num, num_hits, sorted_x_labels) -> pd.DataFrame:
    """
    Gets the residues with the greatest energetic contributions.

    The user can specify how many they would like to see.

    Parameters
    ----------
    df: pd.DataFrame
        GBSA DataFrame with the updated residue names
    sub_num: int
        The index of your substrate
    num_hits: int
        The number of top hits that the user would like
    sorted_x_labels: list
        List of residue names sorted by total energy

    Returns
    -------
    df_hits: pd.DataFrame
        The DataFrame sorted only for the residues of interest.

    """
    # Get the top largest contributors to ligand interaction energies
    df_hits = df[df["Resid 1"] == sub_num].nsmallest(num_hits, "Total", keep="all")

    residue_order = CategoricalDtype(sorted_x_labels, ordered=True)
    df_hits["Residue"] = df_hits["Residue"].astype(residue_order)
    df_hits = df_hits.sort_values("Residue")

    df_hits.to_csv("top_hits.csv", index=False)

    return df_hits


[docs]def plot_single_total_gbsa(df, file_name) -> None:
    """
    Plot the total GBSA energy scores for each residue.

    Generates a bar plot representing the by-residue total GBSA energy.
    The bar plot is saved as 'gbsa_total.svg' in the current directory.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the residue information and GBSA energies.
    file_name : str
        Name of the file where the plot will be saved.

    """
    colors = "darkgrey"
    ax = df.plot.bar(x="Residue", y="Total", color=colors, figsize=(4, 4))
    ax.set_ylabel("GBSA energy (kcal/mol)", weight="bold")
    ax.set_xlabel("residue", weight="bold")
    
    extensions = ["png", "svg"]
    for ext in extensions:
        plt.savefig(f"{file_name}.{ext}", bbox_inches="tight", format=ext, transparent=True)
    plt.close()


[docs]def plot_clustered_stacked(df, y_columns, sorted_x_labels):
    """
    Plot the GBSA energy by type for a single dataframe.

    This function generates a stacked bar plot representing the GBSA energies.
    for each energy type. The plot is saved as a file in the working directory.

    Parameters
    ----------
    df_hits : pd.DataFrame
        DataFrame containing the residue information and GBSA energies.
    y_columns : list
        List of column names in `df_hits` that represent different components.
    sorted_x_labels : list
        List of residue names in order that they appear on the x-axis.

    """
    format_plot()
    _, axe = plt.subplots(figsize=(4, 4))
    plt.axhline(y=0, color="k", alpha=0.5, linestyle="-", linewidth=3)
    colors = ["#fb8500", "#ffb703", "#023047", "#219ebc", "#8ecae6"]

    residue_order = CategoricalDtype(sorted_x_labels, ordered=True)
    df = df.copy()
    df.loc[:, "Residue"] = df["Residue"].astype(residue_order)
    axe = df.sort_values("Residue").plot.bar(
        x="Residue",
        y=y_columns,
        color=colors,
        linewidth=0,
        stacked=True,
        ax=axe,
        legend=True,
        grid=False,
        width=0.3,
    )

    axe.set_ylabel("GBSA energy (kcal/mol))", weight="bold")
    axe.set_xlabel("residue", weight="bold")
    axe.set_xticks([tick for tick in axe.get_xticks()])

    return axe


[docs]def plot_all_gbsa(df_hits, y_columns, sorted_x_labels) -> None:
    """
    Plot the GBSA energy scores by type for a single dataframe.

    Generates a stacked bar plot representing the GBSA component energies.
    The plot is saved as a file in the working directory.

    Parameters
    ----------
    df_hits : pd.DataFrame
        DataFrame containing the residue information and GBSA energies.
    y_columns : list
        List of column names in `df_hits` that represent component energies.
    sorted_x_labels : list
        List of residue names in the order in which they appear on the x-axis.

    """

    series_columns = y_columns + ["Residue"]
    plot_clustered_stacked(
        df_hits[series_columns],
        y_columns,
        sorted_x_labels,
    )

    extensions = ["png", "svg"]
    for ext in extensions:
        plt.savefig(f"stacked_single.{ext}", bbox_inches="tight", format=ext, transparent=True)


[docs]def analyze() -> None:
    """
    Main GBSA analysis wrapper function for a single dataset.
    """

    print("\n.---------------.")
    print("| GBSA ANALYZER |")
    print(".---------------.\n")
    print("This script will process a single GBSA output file")
    print("Looks for file24.dat\n")

    # Get user input
    sub_num = int(
        input("What is the index of your substrate in your GBSA calculation?: ")
    )
    num_hits = int(input("Show me the top n residues: "))
    ignore_residues = input("What residues would you like ignored (e.g., LS1,LS2)? ").split(',')

    file_extension = "*24.dat"

    # Collect the GBSA data located in the current directory
    raw_files = glob.glob(file_extension)
    raw_files = sorted(raw_files)

    if len(raw_files) == 0:
        print("No *24.dat files found. Please check your directory.")
        return

    raw = raw_files[0]

    # Format plot
    format_plot()

    # Get and process GBSA dataframe
    df = get_gbsa_df(raw, ignore_residues)
    df = update_res_names(df)

    # Generate plots
    df_hits = df[df["Resid 1"] == sub_num].nsmallest(num_hits, "Total", keep="all")
    sorted_x_labels = df_hits["Residue"].tolist()
    df_hits = get_top_hits_df(df, sub_num, num_hits, sorted_x_labels)

    # Plot GBSA Total
    plot_single_total_gbsa(df_hits, "gbsa_total")

    # Plot All GBSA
    plot_all_gbsa(df_hits, ["VDW", "Electrostatic", "Polar", "Non-polar"], sorted_x_labels)


if __name__ == "__main__":
    analyze()