|
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
|
|
|
|
INVALID_ANGLE = 10 # invalid angle value
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description='compute_phi_psi.py')
|
|
parser.add_argument('fname')
|
|
parser.add_argument('-plot', action='store_true')
|
|
args = parser.parse_args()
|
|
return args
|
|
|
|
|
|
class running_stats:
|
|
'''this class computes mean and std values for a
|
|
list of values across proteins'''
|
|
|
|
def __init__(self, name_str):
|
|
self.name = name_str
|
|
self.x = 0.0
|
|
self.x2 = 0.0
|
|
self.n = 0
|
|
|
|
def update(self, Vin):
|
|
V = np.array(Vin)
|
|
self.x += np.sum(V)
|
|
self.x2 += np.sum(V**2)
|
|
self.n += len(Vin)
|
|
|
|
def print_stats(self):
|
|
mu = self.x / self.n
|
|
std = np.sqrt(self.x2/self.n - mu**2)
|
|
print("bond length", self.name, mu, std)
|
|
|
|
|
|
class compute_stats:
|
|
'''Compute mean and std values for bond lengths and bond angles'''
|
|
|
|
def __init__(self):
|
|
|
|
self.CN = running_stats("CN")
|
|
self.NCa = running_stats("NCa")
|
|
self.CaC = running_stats("CaC")
|
|
|
|
self.CNCa = running_stats("CNCa")
|
|
self.NCaC = running_stats("NCaC")
|
|
self.CaCN = running_stats("CaCN")
|
|
|
|
def update(self, v_CN, v_NCa, v_CaC, v_CNCa, v_NCaC, v_CaCN):
|
|
self.CN.update(v_CN)
|
|
self.NCa.update(v_NCa)
|
|
self.CaC.update(v_CaC)
|
|
|
|
self.CNCa.update(v_CNCa)
|
|
self.NCaC.update(v_NCaC)
|
|
self.CaCN.update(v_CaCN)
|
|
|
|
def print_stats(self):
|
|
self.CN.print_stats()
|
|
self.NCa.print_stats()
|
|
self.CaC.print_stats()
|
|
|
|
self.CNCa.print_stats()
|
|
self.NCaC.print_stats()
|
|
self.CaCN.print_stats()
|
|
|
|
|
|
def process_tertiary(tertiary):
|
|
'''compute the bond lengths, bond angles, and dihedral angles'''
|
|
phi = []
|
|
psi = []
|
|
omega = []
|
|
bond_angle_CNCa = []
|
|
bond_angle_NCaC = []
|
|
bond_angle_CaCN = []
|
|
bond_len_NCa = []
|
|
bond_len_CaC = []
|
|
bond_len_CN = []
|
|
# convert tertiary coords into Vectors
|
|
pV = [vec for vec in map(lambda v: Vector(v[0], v[1], v[2]),
|
|
zip(tertiary[0], tertiary[1], tertiary[2]))]
|
|
|
|
for i in range(0, len(pV), 3):
|
|
# check for zero coords
|
|
norm_im1 = False
|
|
norm_i = False
|
|
norm_i1 = False
|
|
norm_i2 = False
|
|
norm_i3 = False
|
|
norm_i4 = False
|
|
if i > 0 and pV[i-1].norm() > 0:
|
|
norm_im1 = True
|
|
if pV[i].norm() > 0:
|
|
norm_i = True
|
|
if pV[i+1].norm() > 0:
|
|
norm_i1 = True
|
|
if pV[i+2].norm() > 0:
|
|
norm_i2 = True
|
|
if i + 3 < len(pV) and pV[i+3].norm() > 0:
|
|
norm_i3 = True
|
|
if i + 3 < len(pV) and pV[i+4].norm() > 0:
|
|
norm_i4 = True
|
|
|
|
# compute bond lengths
|
|
if norm_im1 and norm_i:
|
|
blen_CN = (pV[i-1]-pV[i]).norm()
|
|
bond_len_CN.append(blen_CN)
|
|
|
|
if norm_i and norm_i1:
|
|
blen_NCa = (pV[i]-pV[i+1]).norm()
|
|
bond_len_NCa.append(blen_NCa)
|
|
|
|
if norm_i1 and norm_i2:
|
|
blen_CaC = (pV[i+1]-pV[i+2]).norm()
|
|
bond_len_CaC.append(blen_CaC)
|
|
|
|
# compute bond angles
|
|
if norm_im1 and norm_i and norm_i1:
|
|
theta_CNCa = calc_angle(pV[i-1], pV[i], pV[i+1]) # C-N-Ca
|
|
bond_angle_CNCa.append(theta_CNCa)
|
|
|
|
if norm_i and norm_i1 and norm_i2:
|
|
theta_NCaC = calc_angle(pV[i], pV[i+1], pV[i+2]) # N-Ca-C
|
|
bond_angle_NCaC.append(theta_NCaC)
|
|
|
|
if norm_i1 and norm_i2 and norm_i3:
|
|
theta_CaCN = calc_angle(pV[i+1], pV[i+2], pV[i+3]) # Ca-C-N
|
|
bond_angle_CaCN.append(theta_CaCN)
|
|
|
|
# compute dihedral angles
|
|
if norm_im1 and norm_i and norm_i1 and norm_i2:
|
|
phi_i = calc_dihedral(
|
|
pV[i-1], pV[i], pV[i+1], pV[i+2]) # N-Ca-C-N
|
|
else:
|
|
phi_i = INVALID_ANGLE
|
|
phi.append(phi_i)
|
|
|
|
if norm_i and norm_i1 and norm_i2 and norm_i3:
|
|
psi_i = calc_dihedral(
|
|
pV[i], pV[i+1], pV[i+2], pV[i+3]) # C-N-Ca-C
|
|
else:
|
|
psi_i = INVALID_ANGLE
|
|
psi.append(psi_i)
|
|
|
|
if norm_i1 and norm_i2 and norm_i3 and norm_i4:
|
|
omega_i = calc_dihedral(
|
|
pV[i+1], pV[i+2], pV[i+3], pV[i+4]) # Ca-C-N-Ca
|
|
else:
|
|
omega_i = INVALID_ANGLE
|
|
omega.append(omega_i)
|
|
|
|
return (phi, psi, omega, bond_angle_NCaC, bond_angle_CaCN,
|
|
bond_angle_CNCa, bond_len_CN, bond_len_NCa, bond_len_CaC)
|
|
|
|
|
|
def read_protein_from_file(args):
|
|
'''Parse ProteinNet file and add PHIPSI entries'''
|
|
CS = compute_stats()
|
|
filename = args.fname
|
|
out_fname = filename+"_ext"
|
|
with open(out_fname, "w") as out_file:
|
|
with open(filename, "r") as file_pointer:
|
|
pcnt = 0
|
|
id_next = False
|
|
while(True):
|
|
next_line = file_pointer.readline()
|
|
print(next_line, file=out_file, end='')
|
|
if id_next:
|
|
id_next = False
|
|
print("ID", next_line, end='')
|
|
if next_line == '[ID]\n':
|
|
id_next = True
|
|
if next_line == '[TERTIARY]\n':
|
|
tertiary = []
|
|
# 3 dimension
|
|
for _axis in range(3):
|
|
next_line = file_pointer.readline()
|
|
print(next_line, file=out_file, end='')
|
|
tertiary.append(
|
|
[float(coord)/100 for coord in next_line.split()])
|
|
|
|
# write the PHI_PSI angles
|
|
print('[PHI_PSI]', file=out_file)
|
|
|
|
phi, psi, omega,\
|
|
bond_angle_NCaC, bond_angle_CaCN, bond_angle_CNCa,\
|
|
bond_len_CN, bond_len_NCa, bond_len_CaC = process_tertiary(
|
|
tertiary)
|
|
|
|
CS.update(bond_len_CN, bond_len_NCa, bond_len_CaC,
|
|
bond_angle_CNCa, bond_angle_NCaC,
|
|
bond_angle_CaCN)
|
|
print("process", pcnt, len(tertiary[0])//3, len(phi),
|
|
len(psi), "\n")
|
|
assert(len(tertiary[0])//3 == len(phi))
|
|
|
|
# print(gt, bl[gt])
|
|
if args.plot:
|
|
# only plot valid phi,psi pairs (not equal to -1)
|
|
phi_a = np.array(phi)
|
|
psi_a = np.array(psi)
|
|
phi_idx = set(
|
|
np.where(phi_a != INVALID_ANGLE)[0])
|
|
psi_idx = set(
|
|
np.where(psi_a != INVALID_ANGLE)[0])
|
|
val_idx = list(phi_idx.intersection(psi_idx))
|
|
plt.plot(phi_a[val_idx], psi_a[val_idx],
|
|
'o', markersize=0.5)
|
|
|
|
out_phi = " ".join([str(v) for v in phi])
|
|
print(out_phi, file=out_file)
|
|
out_psi = " ".join([str(v) for v in psi])
|
|
print(out_psi, file=out_file)
|
|
|
|
elif next_line == '\n':
|
|
pcnt += 1
|
|
|
|
elif next_line == '':
|
|
break
|
|
if args.plot:
|
|
# plt.show()
|
|
plt.savefig(args.fname+'_phipsi_plot.png', dpi=300,
|
|
transparent=True)
|
|
CS.print_stats()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
args = parse_args()
|
|
print(args)
|
|
read_protein_from_file(args)
|