Source code for pyqmmm.md.edia_chain_calculator

"""This script takes EDIA output and cacluates the average chain EDIA scores."""

import matplotlib.pyplot as plt
import pandas as pd


[docs]def get_edia():
    
    # Get user input
    csv = input("Name of your EDIA CSV file without extension?: ")
    chains = input('Chains to compare (e.g., "ABCD")? ')
    residues = input('Key residues of interest (e.g., "9,23,134" or "none"): ')

    # Open CSV file into pandas dataframe
    df = pd.read_csv("{}.csv".format(csv))
    df_all = df[["Chain", "EDIAm", "Median EDIA"]]

    # Examine user chains input and return errors
    chains = chains.upper()
    chain_list = []
    for chain in chains:
        if chain in df_all.Chain.values:
            print("Chain {} found.".format(chain))
            chain_list.append(chain)
        else:
            print("Chain {} not found.".format(chain))
    print("-------------------")
    print()

    # Create empty dataframe to store means
    df_means = pd.DataFrame(
        index=[chain for chain in chain_list], columns=["EDIAm", "EDIA"]
    )
    # Create empty dataframe to store standard error
    df_se = pd.DataFrame(
        index=[chain for chain in chain_list], columns=["EDIAm", "EDIA"]
    )

    # Calculate the Avg. EDIAm, Avg. EDIA, and Std. Error
    for chain in chain_list:
        # Isolate a specific chain
        isolated_chain = df_all.loc[df["Chain"] == chain]

        # Get the mean for EDIAm and EDIA
        avg_ediam = isolated_chain["EDIAm"].mean()
        avg_edia = isolated_chain["Median EDIA"].mean()
        # Store it in the df_means dataframe
        df_means.loc[chain] = [avg_ediam, avg_edia]

        # Get the standard error for EDIAm and EDIA
        std_err_ediam = isolated_chain["EDIAm"].sem()
        std_err_edia = isolated_chain["Median EDIA"].sem()
        # Store it in the df_means dataframe
        df_se.loc[chain] = [std_err_ediam, std_err_edia]

    # Rank order the chains by EDIA score
    sorted_df = df_means.sort_values(by="EDIA", ascending=False)
    print("Your sorted results for all residues:")
    print(sorted_df)
    print("-------------------")
    print()

    # Plot the findings
    # Plot constants
    colors = ["#ff16ea", "#020cfa"]

    # Plot parameters
    plt.rc("axes", linewidth=2.5)
    fig, ax = plt.subplots()
    df_means.plot.bar(yerr=df_se, ax=ax, capsize=4, rot=0, color=colors)
    plt.title("Chain Comparison", fontsize=18)
    plt.ylabel("EDIA Score", fontsize=16)
    plt.xlabel("Chains", fontsize=16)
    plt.xticks(rotation=0)
    plt.tick_params(labelsize=14)
    plt.legend(fontsize=12, loc="lower right")
    plt.savefig("chain_comparison.pdf", bbox_inches="tight")
    print("Figure has been created.")
    print("-------------------")
    print()

    # Select out key columns
    df_res = df[["ID", "Chain", "EDIAm", "Median EDIA"]]

    # Examine user residue input and return errors
    if residues != "none":
        residue_split = residues.split(",")
        residue_split = [int(i) for i in residue_split]
        residue_list = []
        for residue in residue_split:
            if residue in df_res.ID.values:
                print("Residue {} found.".format(residue))
                residue_list.append(residue)
            else:
                print("Residue {} not found.".format(residue))
    print("-------------------")

    # Select only residues the user asked for
    df_sel_res = df_res[df_res.ID.isin(residue_list)]
    for residue in residue_list:
        isolated_residue = df_sel_res.loc[df_sel_res["ID"] == residue]
        chains_present = isolated_residue["Chain"].values
        for chain in chain_list:
            if chain not in chains_present:
                df_row = {"ID": residue, "Chain": chain, "EDIAm": 0, "Median EDIA": 0}
                df_sel_res = df_sel_res.append(df_row, ignore_index=True)

    # Create empty dataframe to store means
    df_res_means = pd.DataFrame(index=chain_list, columns=["EDIAm", "EDIA"])

    # Create empty dataframe to store standard error
    df_res_se = pd.DataFrame(index=chain_list, columns=["EDIAm", "EDIA"])

    # Calculate the Avg. EDIAm, Avg. EDIA, and Std. Error
    for chain in chain_list:
        # Isolate a specific chain
        isolated_residue = df_sel_res.loc[df_sel_res["Chain"] == chain]

        # Get the mean for EDIAm and EDIA
        avg_res_ediam = isolated_residue["EDIAm"].mean()
        avg_res_edia = isolated_residue["Median EDIA"].mean()
        # Store it in the df_means dataframe
        df_res_means.loc[chain] = [avg_res_ediam, avg_res_edia]

        # Get the standard error for EDIAm and EDIA
        std_err_res_ediam = isolated_residue["EDIAm"].sem()
        std_err_res_edia = isolated_residue["Median EDIA"].sem()
        # Store it in the df_means dataframe
        df_res_se.loc[chain] = [std_err_res_ediam, std_err_res_edia]

    return df_res_means


[docs]def edia_chain_calculator():
    # Rank order the chains by EDIA score
    df_res_means = get_edia()
    sorted_res_df = df_res_means.sort_values(by="EDIA", ascending=False)
    print("Your sorted results for specific residues:")
    print(sorted_res_df)
    print("-------------------")
    print()

    # Plot the findings
    plt.rc("axes", linewidth=2.5)
    fig, ax = plt.subplots()
    df_res_means.plot.bar(yerr=df_res_se, ax=ax, capsize=4, rot=0, color=colors)
    plt.title("Chain Comparison for Select Residues", fontsize=18)
    plt.ylabel("EDIA Score", fontsize=16)
    plt.xlabel("Chains", fontsize=16)
    plt.xticks(rotation=0)
    plt.tick_params(labelsize=14)
    plt.legend(fontsize=12, loc="lower right")
    plt.savefig("chain_res_comp.pdf", bbox_inches="tight")
    print("Figure has been created.")
    print("-------------------")


if __name__ == "__main__":
    edia_chain_calculator()