"""Analyze data from hydrogen bonding analysis based on hbond.gnu file."""
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import subprocess
import sys
import os
import textwrap
[docs]def compute_hbonds(cpptraj_script, submit_script, script_name):
"""
Calculates the hbonding data if it does not exist.
Checks if hbonding data has been generated in the current folder.
If it hasn't, the script submits a a CPPTraj job and then exits.
Once the CPPTraj job finishes and the job is run again,
it will check that it finished currently and continue to the analysis.
Parameters
----------
protein_name: str
The name of your protein used in the prmtop file name (e.g., TAUD)
substrate_index: int
The index of the substrate (e.g., 280)
residue_range: str
The range of amino acids for this protein (e.g., 1-243)
See Also
--------
caddkit.md.hbonding_analyzer.analyze_hbonds()
"""
# Check if hbond.gnu exists in the current directory
if os.path.exists("hbond.gnu"):
print(" > Hbonding results found")
else:
print(" > No hbond data found, submitting")
# Write the CPPTRAJ script to a file
with open(script_name, "w") as f:
f.write(cpptraj_script)
# Write the submit script to a file
with open("submit.sh", "w") as f:
f.write(submit_script)
# Submit the job using the submit script
subprocess.run(["qsub", "submit.sh"])
print(" > A hbonding job has been submitted")
[docs]def bond_labels(file_path, ignore_backbone=True, include_backbone=tuple("DHK")):
"""
Extracts bond labels from gnu file.
Parameters
----------
file_path: str
Path to hbond.gnu file
ignore_backbone: bool
Whether to ignore backbone hydrogen bonds (donor/acceptor named N or O)
include_backbone: tuple(str)
If ignore_backbone, these residues are the exception
Returns
-------
grouped: pd.DataFrame
DataFrame containing residue pair and interaction index
"""
dict = {}
with open(file_path, "r") as f:
for line in f:
if line[:10] == "set ytics(":
bonds = line.split("(")[1].split(")")[0]
for b in bonds.split(","):
key_val = b.split(" ")
dict[int(float(key_val[-1]))] = key_val[0].strip('"')
labels = pd.Series(dict.values(), index=dict.keys(), name="labels")
labels = labels.str.split("-", expand=True)
labels.columns = ["acceptor", "donor", "hydrogen"]
labels[["acceptor", "acceptor_atom"]] = labels["acceptor"].str.split(
"@", expand=True
)
labels[["donor", "donor_atom"]] = labels["donor"].str.split("@", expand=True)
# Ignore backbone hydrogen atom donors/acceptors if the acceptor or donor is not in include_backbone
if ignore_backbone:
include_regex = "|".join(include_backbone)
labels = labels[
~(
(
labels["acceptor_atom"].isin(["N", "O"])
& ~labels["acceptor"].str.contains(include_regex, regex=True)
)
| (
labels["donor_atom"].isin(["N", "O"])
& ~labels["donor"].str.contains(include_regex, regex=True)
)
)
]
labels = labels.reset_index()
grouped = labels.groupby(["acceptor", "donor"])["index"].apply(set).reset_index()
return grouped
[docs]def count_occurrences(file_path, labels):
"""
Counts percent occurrences of each hydrogen bond.
Parameters
----------
file_path: str
Path to hbond.gnu file
labels: str
Hbond labels as outputted by CPPTRAJ in gnu format
Returns
-------
labels: DataFrame containing residue pair and interaction index
frame_count: total number of frames in trajectory
"""
labels["count"] = 0
frame_count = 0
with open(file_path, "r") as f:
for _ in range(8):
next(f)
for frame in f.read().split("\n\n"):
frame_count += 1
bonds = set()
for line in frame.split("\n"):
if line == "end":
break
arr = [int(float(x)) for x in line.split(" ") if x]
if arr[-1] == 1:
bonds.add(arr[1])
contains_bond = [len(i & bonds) != 0 for i in labels["index"]]
labels.loc[contains_bond, "count"] += 1
return labels, frame_count
[docs]def process_data(count_df, frame_count, name, substrate):
"""
Cleans up hbonding data (formats residue names, dataframe index, etc.)
Parameters
----------
count_df: pd.DataFrame
DataFrame containing the hydrogen bonding counts
frame_count: int
Number of frames in trajectory
name: str
System name
Returns
-------
count_df: pd.DataFrame
Cleaned DataFrame containing the hydrogen bonding counts
"""
increment = 0 # Is there an offset between the res number and the real res number
res_ser = pd.concat([count_df["acceptor"], count_df["donor"]], axis=0)
count_df["residue"] = res_ser[~res_ser.str.contains(substrate)].sort_index()
count_df["percent_occurrence"] = count_df["count"] / frame_count
count_df[["residue", "position"]] = count_df["residue"].str.split("_", expand=True)
count_df["position"] = count_df["position"].astype(int) + increment
format_residues_dict = {
"ALA": "A",
"ARG": "R",
"ASN": "N",
"AP1": "D",
"ASP": "D",
"CYS": "C",
"FE1": "FE",
"GLU": "E",
"GLN": "Q",
"GLY": "G",
"HID": "H",
"HIS": "H",
"HIP": "H",
"HIE": "H",
"HD1": "H",
"HD2": "H",
"ILE": "I",
"LEU": "L",
"LYS": "K",
"MET": "M",
"OO1": "OXO",
"PHE": "F",
"PRO": "P",
"SER": "S",
"SC1": "SUC",
"THR": "T",
"TRP": "W",
"TYR": "Y",
"VAL": "V",
"_": "",
}
for old, new in format_residues_dict.items():
count_df["residue"] = count_df["residue"].str.replace(old, new, regex=False)
count_df["residue"] = count_df["residue"] + count_df["position"].astype(str)
count_df = count_df.rename(columns={"percent_occurrence": name})[
["residue", name, "position"]
]
count_df = count_df.set_index("residue")
count_df[name] = count_df[name] * 100
return count_df
[docs]def plot(data, file_path):
"""
Plot single figure for hbonding analysis.
Parameters
----------
data: pd.DataFrame
Data that will be used to plot the figure
file_path: str
Path to directory where output image should go
"""
figure_formatting()
df = data.sort_values("position").drop(["position"], axis=1)
df = df[df.ge(0.1).all(axis=1)] # 10% occurence cutoff
# Sort dataframe by occurrence in descending order and select the top 7 rows
df = df.sort_values(by=[df.columns[0]], ascending=False).head(15)
ax = df.plot.bar(
color="darkgray", figsize=(4, 4)
) # Add edgecolor and facecolor attributes
ax.set_ylabel("occurrence (%)", weight="bold")
ax.set_xlabel("residue", weight="bold")
ax.legend().set_visible(False)
ax.tick_params(axis="x", labelrotation=90)
extensions = ["png", "svg"]
for ext in extensions:
plt.savefig(file_path + f"hbond.{ext}", format=ext, dpi=600, bbox_inches="tight")
[docs]def plot_multi(data, file_path):
"""
Plot hbonding comparison between two trajectories.
Parameters
----------
data: list of two dataframes
file_path: path to directory where output image should go
"""
figure_formatting()
new_df = (
pd.merge(data[0], data[1], on=["residue", "position"], how="inner")
.sort_index()
.sort_values("position")
)
new_df = new_df.dropna(axis=0).drop(["position"], axis=1)
new_df = new_df[new_df.ge(0.1).all(axis=1)] # 10% ocurrence cutoff
ax = new_df.plot.bar(color=["Blue", "Red", "Orange"])
ax.set_ylabel("occurrence (%)", weight="bold")
ax.set_xlabel("residue", weight="bold")
ax.legend(bbox_to_anchor=(1.34, 1.02), frameon=False)
ax.tick_params(axis="x", labelrotation=90)
extensions = ["png", "svg"]
for ext in extensions:
plt.savefig(
file_path + f"hbond_comparison.{ext}",
bbox_inches="tight",
transparent=False,
dpi=600,
format=ext,
)
[docs]def analyze_hbonds(file_paths, names, substrate):
"""
Driver for analyzing hbonds from hbond.gnu file
Parameters
----------
file_paths: list[str]
A list of paths to hbond.gnu files
names: list[str]
A list of names of each hbond.gnu files
"""
data = []
for file_path, name in zip(file_paths, names):
data_path = Path(file_path + "hbond.csv")
if data_path.is_file():
print(f" > {data_path} already exists")
d = pd.read_csv(file_path + "hbond.csv")
d = d.set_index("residue")
else:
path = file_path + "hbond.gnu"
print(f" > Processing: {path}")
label_df = bond_labels(path)
count_df, frame_count = count_occurrences(path, label_df)
d = process_data(count_df, frame_count, name, substrate)
d.to_csv(file_path + "hbond.csv")
plot(d, file_path)
print(f" > Creating single plot")
data.append(d)
if len(names) > 1:
plot_multi(data, file_paths[0])
print(f" > Creating multi plot")
if __name__ == "__main__":
"""
First argument contains [data_file1, data_file2] -- two directories containing hbond.gnu files
--> this will also be the output directories
Second argument contains [name1, name2] -- names corresponding to the hbond.gnu files (e.g., H127A, Wild-type)
"""
analyze_hbonds(
[
"/Users/kastner/Downloads/test/obtuse/",
"/Users/kastner/Downloads/test/acute/",
],
["acute", "obtuse"],
"DHK",
)