compute_phi_psi.py (Source)

#!/usr/bin/env python3
import argparse
import matplotlib.pyplot as plt
import numpy as np
from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
INVALID_ANGLE = 10  # invalid angle value
def parse_args():
    parser = argparse.ArgumentParser(description='compute_phi_psi.py')
    parser.add_argument('fname')
    parser.add_argument('-plot', action='store_true')
    args = parser.parse_args()
    return args
class running_stats:
    '''this class computes mean and std values for a
    list of values across proteins'''
    def __init__(self, name_str):
        self.name = name_str
        self.x = 0.0
        self.x2 = 0.0
        self.n = 0
    def update(self, Vin):
        V = np.array(Vin)
        self.x += np.sum(V)
        self.x2 += np.sum(V**2)
        self.n += len(Vin)
    def print_stats(self):
        mu = self.x / self.n
        std = np.sqrt(self.x2/self.n - mu**2)
        print("bond length", self.name, mu, std)
class compute_stats:
    '''Compute mean and std values for bond lengths and bond angles'''
    def __init__(self):
        self.CN = running_stats("CN")
        self.NCa = running_stats("NCa")
        self.CaC = running_stats("CaC")
        self.CNCa = running_stats("CNCa")
        self.NCaC = running_stats("NCaC")
        self.CaCN = running_stats("CaCN")
    def update(self, v_CN, v_NCa, v_CaC, v_CNCa, v_NCaC, v_CaCN):
        self.CN.update(v_CN)
        self.NCa.update(v_NCa)
        self.CaC.update(v_CaC)
        self.CNCa.update(v_CNCa)
        self.NCaC.update(v_NCaC)
        self.CaCN.update(v_CaCN)
    def print_stats(self):
        self.CN.print_stats()
        self.NCa.print_stats()
        self.CaC.print_stats()
        self.CNCa.print_stats()
        self.NCaC.print_stats()
        self.CaCN.print_stats()
def process_tertiary(tertiary):
    '''compute the bond lengths, bond angles, and dihedral angles'''
    phi = []
    psi = []
    omega = []
    bond_angle_CNCa = []
    bond_angle_NCaC = []
    bond_angle_CaCN = []
    bond_len_NCa = []
    bond_len_CaC = []
    bond_len_CN = []
    # convert tertiary coords into Vectors
    pV = [vec for vec in map(lambda v: Vector(v[0], v[1], v[2]),
                             zip(tertiary[0], tertiary[1], tertiary[2]))]
    for i in range(0, len(pV), 3):
        # check for zero coords
        norm_im1 = False
        norm_i = False
        norm_i1 = False
        norm_i2 = False
        norm_i3 = False
        norm_i4 = False
        if i > 0 and pV[i-1].norm() > 0:
            norm_im1 = True
        if pV[i].norm() > 0:
            norm_i = True
        if pV[i+1].norm() > 0:
            norm_i1 = True
        if pV[i+2].norm() > 0:
            norm_i2 = True
        if i + 3 < len(pV) and pV[i+3].norm() > 0:
            norm_i3 = True
        if i + 3 < len(pV) and pV[i+4].norm() > 0:
            norm_i4 = True
        # compute bond lengths
        if norm_im1 and norm_i:
            blen_CN = (pV[i-1]-pV[i]).norm()
            bond_len_CN.append(blen_CN)
        if norm_i and norm_i1:
            blen_NCa = (pV[i]-pV[i+1]).norm()
            bond_len_NCa.append(blen_NCa)
        if norm_i1 and norm_i2:
            blen_CaC = (pV[i+1]-pV[i+2]).norm()
            bond_len_CaC.append(blen_CaC)
        # compute bond angles
        if norm_im1 and norm_i and norm_i1:
            theta_CNCa = calc_angle(pV[i-1], pV[i], pV[i+1])  # C-N-Ca
            bond_angle_CNCa.append(theta_CNCa)
        if norm_i and norm_i1 and norm_i2:
            theta_NCaC = calc_angle(pV[i], pV[i+1], pV[i+2])  # N-Ca-C
            bond_angle_NCaC.append(theta_NCaC)
        if norm_i1 and norm_i2 and norm_i3:
            theta_CaCN = calc_angle(pV[i+1], pV[i+2], pV[i+3])  # Ca-C-N
            bond_angle_CaCN.append(theta_CaCN)
        # compute dihedral angles
        if norm_im1 and norm_i and norm_i1 and norm_i2:
            phi_i = calc_dihedral(
                pV[i-1], pV[i], pV[i+1], pV[i+2])  # N-Ca-C-N
        else:
            phi_i = INVALID_ANGLE
        phi.append(phi_i)
        if norm_i and norm_i1 and norm_i2 and norm_i3:
            psi_i = calc_dihedral(
                pV[i], pV[i+1], pV[i+2], pV[i+3])  # C-N-Ca-C
        else:
            psi_i = INVALID_ANGLE
        psi.append(psi_i)
        if norm_i1 and norm_i2 and norm_i3 and norm_i4:
            omega_i = calc_dihedral(
                pV[i+1], pV[i+2], pV[i+3], pV[i+4])  # Ca-C-N-Ca
        else:
            omega_i = INVALID_ANGLE
        omega.append(omega_i)
    return (phi, psi, omega, bond_angle_NCaC, bond_angle_CaCN,
            bond_angle_CNCa, bond_len_CN, bond_len_NCa, bond_len_CaC)
def read_protein_from_file(args):
    '''Parse ProteinNet file and add PHIPSI entries'''
    CS = compute_stats()
    filename = args.fname
    out_fname = filename+"_ext"
    with open(out_fname, "w") as out_file:
        with open(filename, "r") as file_pointer:
            pcnt = 0
            id_next = False
            while(True):
                next_line = file_pointer.readline()
                print(next_line, file=out_file, end='')
                if id_next:
                    id_next = False
                    print("ID", next_line, end='')
                if next_line == '[ID]\n':
                    id_next = True
                if next_line == '[TERTIARY]\n':
                    tertiary = []
                    # 3 dimension
                    for _axis in range(3):
                        next_line = file_pointer.readline()
                        print(next_line, file=out_file, end='')
                        tertiary.append(
                            [float(coord)/100 for coord in next_line.split()])
                    # write the PHI_PSI angles
                    print('[PHI_PSI]', file=out_file)
                    phi, psi, omega,\
                        bond_angle_NCaC, bond_angle_CaCN, bond_angle_CNCa,\
                        bond_len_CN, bond_len_NCa, bond_len_CaC = process_tertiary(
                            tertiary)
                    CS.update(bond_len_CN, bond_len_NCa, bond_len_CaC,
                              bond_angle_CNCa, bond_angle_NCaC,
                              bond_angle_CaCN)
                    print("process", pcnt, len(tertiary[0])//3, len(phi),
                          len(psi), "\n")
                    assert(len(tertiary[0])//3 == len(phi))
                    # print(gt, bl[gt])
                    if args.plot:
                        # only plot valid phi,psi pairs (not equal to -1)
                        phi_a = np.array(phi)
                        psi_a = np.array(psi)
                        phi_idx = set(
                            np.where(phi_a != INVALID_ANGLE)[0])
                        psi_idx = set(
                            np.where(psi_a != INVALID_ANGLE)[0])
                        val_idx = list(phi_idx.intersection(psi_idx))
                        plt.plot(phi_a[val_idx], psi_a[val_idx],
                                 'o', markersize=0.5)
                    out_phi = " ".join([str(v) for v in phi])
                    print(out_phi, file=out_file)
                    out_psi = " ".join([str(v) for v in psi])
                    print(out_psi, file=out_file)
                elif next_line == '\n':
                    pcnt += 1
                elif next_line == '':
                    break
        if args.plot:
            # plt.show()
            plt.savefig(args.fname+'_phipsi_plot.png', dpi=300,
                        transparent=True)
        CS.print_stats()
if __name__ == "__main__":
    args = parse_args()
    print(args)
    read_protein_from_file(args)