#!/bin/bash
# Donata Kirchner, December 2010

# Call various structure validation programs and manage output processing

# *** Variable Initiation ***
# ***************************
# ARRAY INDICES:
# 0: ProSa2003, 1: Verify3D, 2: WhatCheck, 3: ProQ, 4: ProcheckNMR,
# 5: MolProbity, 6: PDB Validation Suite, 7: RPF Web Server
# ***************************
# Per-program switch, in the index order listed above.
# 0 = run the program, 1 = skip it; everything starts out switched off.
use_program=( 1 1 1 1 1 1 1 1 )
use_programs=( " " )

# Display names of the programs, same index order as 'use_program'.
program_names=(
   "ProSa2003"
   "Verify3D"
   "WhatCheck"
   "ProQ"
   "ProcheckNMR"
   "MolProbity"
   "PDB Validation Suite"
   "RPF"
)

# Where the programs live and how missing ones are dealt with.
paths=( " " )    # the paths to the executables, same index order again
program_path=''  # path typed in by the user in 'option_menu'
program_file=''  # file listing the program paths in the order of 'program_names'
ask=false        # whether to ask the user for input in case a program is not found

# Options collected for the script that processes the results.
arguments=( " " )
arg_index=0

# RPF specifics.
rpf_cmd_file=''  # file containing info required by RPF
rpf_value=''

# Hour, minute, day and month of the start of this run.
timestamp=$(date +%H%M%d%m)

# file names set by this script, grouped by the program they belong to:

# ProSa2003
prosa_cmd='prosa.cmd'
prosa_out='prosa_out'
prosa_log='prosa.log'
prosa_plot='prosa2003_energy'

# Verify3D
verify_env_out='verify.env'
verify_plot='verify_plot.out'
verify_log='verify.log'

# ProQ, ProcheckNMR, WhatCheck
proq_out='ProQ.out'
procheck_out='ProcheckNMR.out'
whatcheck_log='whatcheck.log'

# MolProbity
molprobity_out='molprobity.out'
molprobity_rama_pdf='molprobity_Rama.pdf'

# PDB Validation Suite: base name given to its renamed output files
pdb='PDB_validation'

# Results of the whole run
output_dir="validation_output${timestamp}"
overview_file='CYANA_Validation_Overview.log'
error_collector='all_errors.log'


# *** Get Command Line Arguments ***
# Option letters and the variables they fill in:
#   -i <file>  input_file               -s <file>  ssf_file
#   -p <file>  psipred_file
#   -P <path>  paths[0] (ProSa2003)     -v <path>  paths[1] (Verify3D)
#   -w <path>  paths[2] (WhatCheck)     -q <path>  paths[3] (ProQ)
#   -c <path>  paths[4] (ProcheckNMR)   -m <path>  paths[5] (MolProbity)
#   -d <path>  paths[6] (PDB Validation Suite)
#   -r <file>  program_file (file listing the program paths)
#   -o <list>  programs (comma-separated names of the programs to run)
#   -f <file>  rpf_cmd_file
#   -a         ask=true (ask the user when a program is not found)
# The leading ':' of the option string selects the silent mode of getopts, so
# this script has to report both error cases itself: '?' (unknown option) and
# ':' (option given without its argument).
while getopts :i:s:p:P:c:q:w:v:o:m:r:d:f:a option
do
   case "$option" in
      i)   input_file=$OPTARG
           ;;
      s)   ssf_file=$OPTARG
           ;;
      p)   psipred_file=$OPTARG
           ;;
      P)   paths[0]=$OPTARG
           ;;
      v)   paths[1]=$OPTARG
           ;;
      w)   paths[2]=$OPTARG
           ;;
      q)   paths[3]=$OPTARG
           ;;
      c)   paths[4]=$OPTARG
           ;;
      m)   paths[5]=$OPTARG
           ;;
      d)   paths[6]=$OPTARG
           ;;
      r)   program_file=$OPTARG
           ;;
      o)   programs=$OPTARG
           ;;
      f)   rpf_cmd_file=$OPTARG
           ;;
      a)   ask=true
           ;;
      :)   echo "$0: option -$OPTARG requires an argument" >&2
           exit 1
           ;;
      '?') echo "$0: invalid option -$OPTARG" >&2
           exit 1
           ;;
   esac
done

# those files will be moved to the output directory.
# Quoted patterns such as "TMP*.*" are stored literally here and are only
# expanded when 'clean_up' walks through the list. The renamed output files of
# the PDB Validation Suite ($pdb.out, $pdb.alignment, $pdb.err) are all covered
# by the "PDB_validation.*" pattern, so they need no entry of their own.
files_to_move=( "$prosa_out.slp" $verify_plot "clean.log" "tmp.sum" "pdbout.txt" $proq_out $procheck_out \
               $verify_log $prosa_log $overview_file "verify.ps" "$prosa_plot.ps" $whatcheck_log \
               "chi_Gfactors.out" "rama_Gfactors.out" $molprobity_out $molprobity_rama_pdf \
               "TMP*.*" "PDB_validation.*" $error_collector )

# those files will be deleted once the calculations are done.
# NOTE(review): $procheck_out appears in both lists; 'clean_up' deletes before
# it moves, so that file is removed rather than kept - confirm that this is
# intended.
files_to_delete=( $input_file $prosa_cmd $verify_env_out $ssf_file "PDBFILE" "ALTERR.LOG" $psipred_file \
                 "*.rin" "check.db" "pdbout.tex" "SCATTER.DAT" "SCATTER.SCC" \
                 "TAPEIN.DAT" "TAPEOUT.DAT" "TEXTABLE.DAT" "TEXSTORE.DAT" "sct*.eps" \
                 "mplot.*" $procheck_out "procheck_nmr.prm" "ps.number" "rmsdev.log" "secstr.log" \
                 "tmp*.rin" "tmp.ave" "tmp.edt" "tmp.new" "tmp.rms" "tplot.log" "vplot.log" \
                 "SEQUENCE.DAT" "fort.14" "procheck.out" "TMP*.rin" $program_file "tmp.bmrb" )


# *** Function Definitions ***
# ****************************

# Tidy the working directory before the script exits:
#   1. create $output_dir if it does not exist yet
#   2. delete everything listed in 'files_to_delete'
#   3. move every existing entry of 'files_to_move' into $output_dir
#   4. collect the plot files of WhatCheck / ProcheckNMR in sub-directories,
#      if those programs were switched on
#   5. move the error log last, so that messages written during steps 1-4 end
#      up in the output directory instead of re-creating a stray log file here
# Globals read: output_dir, error_collector, files_to_delete, files_to_move,
#               use_program
# The two file lists are expanded unquoted on purpose: several of their entries
# are glob patterns that must be expanded here.
function clean_up {
   local file
   local move_log=false

   if [ ! -d "$output_dir" ] ; then
      mkdir "$output_dir" 2>>"$error_collector"
   fi

   rm -f ${files_to_delete[*]}

   for file in ${files_to_move[*]}
   do
      if [ "$file" = "$error_collector" ] ; then
         move_log=true   # still needed for the redirections below
         continue
      fi
      if [ -e "$file" ] ; then
         mv "$file" "$output_dir"
      fi
   done

   # create sub-directories for the large number of plot files that may have been created
   if [ "${use_program[2]}" = 0 ] ; then
      mkdir "$output_dir/whatcheck_plots" 2>>"$error_collector"
      mv eps*.eps "$output_dir/whatcheck_plots" 2>>"$error_collector"
   fi

   if [ "${use_program[4]}" = 0 ] ; then
      rm -f TMP*ps  # may have been generated by the PDB validation suite (if it contains procheck)
      mkdir "$output_dir/procheck_plots" 2>>"$error_collector"
      mv tmp*.ps "$output_dir/procheck_plots" 2>>"$error_collector"
   fi

   if [ "$move_log" = true ] && [ -e "$error_collector" ] ; then
      mv "$error_collector" "$output_dir"
   fi
}


# Report that a validation program was not found and, if allowed, let the user
# pick one of three options from a menu.
# usage: option_menu <program name> <ask flag>
#   <program name>  name shown in the messages
#   <ask flag>      'true' displays the menu; anything else skips it
# Return status:
#   1  the user chose to continue without the program (also used when the
#      input ends before a valid choice was made)
#   2  the user entered a path; it is stored in the global 'program_path'
#   3  the menu was not displayed; continuing without the program
# Choosing "Abort validation" calls 'clean_up' and exits the script with status 1.
function option_menu {
   local program=$1
   local display=$2
   local alt choice

   # The name is passed as an argument, never as part of the format string, so
   # that a '%' or '\' in it is printed literally.
   printf '\n*** The program %s was not found. ***\n' "$program"
   if [ "$display" == true ] ; then
      PS3='choice? '
      select alt in "Continue without $program" "Enter path to $program" "Abort validation"
      do
         choice=${REPLY//[[:space:]]/}   # tolerate blanks around the number
         case "$choice" in
            1) return 1
               ;;
            2) printf '\n*** Path to %s: ' "$program"
               read -r program_path
               return 2
               ;;
            3) clean_up
               exit 1
               ;;
            *) printf "\n*** Your input ('%s') is invalid. Please enter either 1, 2, or 3.\n\n" "$REPLY"
               ;;
         esac
      done
      # 'select' only ends here when the input is exhausted (EOF)
      printf '   *** Continuing without %s ***\n' "$program"
      return 1
   else
      printf '   *** Continuing without %s ***\n' "$program"
      return 3
   fi
}

# check whether a program is installed
# usage: check_program <path variable> <display name> <usage status variable> <program> <ask flag>
#   $1  NAME of the variable holding the program path (e.g. paths[0]); it is
#       assigned the path the user enters in the menu
#   $2  program name for screen output
#   $3  NAME of the usage status variable (e.g. use_program[0]); it is set to 1
#       when the validation continues without the program
#   $4  VALUE of the program path/name, i.e. what 'type' should look for
#   $5  'true' if the user may be asked what to do
# We have to pass the program once as a variable name so that it can be assigned
# a new value, and once as a value, otherwise 'type' would look for the string
# that denotes the variable's name.
function check_program {
   local return_val

   # An empty name can never be run, and 'type' without any argument reports
   # success, so that case has to be caught first.
   if [ -z "$4" ] || ! type -a "$4" &>>"$error_collector" ; then
      # quoted: display names such as "PDB Validation Suite" contain blanks
      option_menu "$2" "$5"
      return_val=$?  # the value that 'option_menu' returned
      case $return_val in
         1|3) eval "$3=1"
              ;;
         2)   # '\$' defers the expansion to 'eval', so the entered path is
              # assigned verbatim and never parsed as shell code
              eval "$1=\$program_path"
              ;;
      esac
   fi
}



# ********************
# *** MAIN PROGRAM ***
# ********************

# *** Determine which Programs to use ***
# The names of the programs to run arrive as one comma-separated string in
# 'programs'. (The old approach was passing the paths of the required programs
# as command line arguments, but as some of those paths were quite long this
# failed at times, as the command line could not hold all those characters.)
oIFS=$IFS
IFS=','
use=($programs)
IFS=$oIFS
indx=0

# Reading the program paths from an external file, one path per line. The order
# of the lines is crucial - make sure that it is as follows:
# prosa, verify, whatcheck, proQ, procheckNMR, molprobity, PDB validation suite
# Nothing is read when no program file was given; paths set through the command
# line options then stay as they are.
if [ -n "$program_file" ] ; then
   if [ -f "$program_file" ] && [ -r "$program_file" ] ; then
      # 'read -r' keeps backslashes as they are; '|| [ -n "$line" ]' makes sure
      # a last line without a trailing newline is not lost
      while read -r line || [ -n "$line" ]
      do
         paths[$((indx++))]=$line
      done < "$program_file"
   else
      echo "$0: cannot read the program file '$program_file'" >&2
   fi
fi

# Translate the program names into program numbers (array index + 1).
# Names that are not known are ignored.
for prog in ${use[*]}
do
   case $prog in
      prosa)         use_indices+=( 1 )
                     ;;
      verify)        use_indices+=( 2 )
                     ;;
      whatcheck)     use_indices+=( 3 )
                     ;;
      proq)          use_indices+=( 4 )
                     ;;
      procheck)      use_indices+=( 5 )
                     ;;
      molprobity)    use_indices+=( 6 )
                     ;;
      validation-v8) use_indices+=( 7 )
                     ;;
      rpf)           use_indices+=( 8 )
                     ;;
      *)             ;;
   esac
done


# *** Check Program Availability/Determine Location ***
for i in ${use_indices[*]}
do
   # earlier on we used numbers that were too large by one (<-- more natural for the user)
   i=$((i - 1))
   if [[ $i -lt 8 && $i -gt -1 ]] ; then
      use_program[$i]=0  # 0 --> indicator that the program is to be used
      # Every argument is quoted: an empty path must stay an (empty) argument of
      # its own, otherwise '$ask' would slip into its place, and the variable
      # names contain '[...]', which the shell would otherwise treat as a glob.
      check_program "paths[$i]" "${program_names[$i]}" "use_program[$i]" "${paths[$i]}" "$ask"
   fi
done


# *** Run Validation Programs ***
# *******************************

# ProSa2003
if [ "${use_program[0]}" = 0 ] ; then
   # ProSa wants a command file as input --> write this now.
   # The file names are passed as printf arguments, not as part of the format,
   # so that a '%' or '\' in a name ends up in the command file unchanged.
   # There is deliberately no newline after the final 'quit'.
   {
      printf 'read pdb %s protein\n' "$input_file"
      printf 'init zscore\n'
      printf 'zscore protein %s\n' "$prosa_out"
      printf 'analyse energy protein\n'
      printf 'graph title Combined Energy\n'
      printf 'plot\n'
      printf 'export plot %s\n' "$prosa_plot"
      printf 'quit'
   } > "$prosa_cmd"

   echo "*** Running ProSa2003 ***"
   echo "*** Running ProSa2003 ***" >>"$error_collector"
   # run ProSa on the command file that was just written; all of its output goes to the log
   ( "${paths[0]}" "$prosa_cmd" > "$prosa_log" 2>&1 )
   arguments[$((arg_index++))]="-prosa $prosa_out.slp"
fi

# Verify3D
if test ${use_program[1]} = 0 ; then
   # Two executables from the same directory are run one after the other;
   # ${paths[1]} only names 'verify_3d', so its directory part is taken.
   verify_env=$(dirname ${paths[1]})

   printf '%s\n' "*** Running Verify3D ***"

   # first 'environments': its input is fed on stdin, its output starts the log
   $verify_env/environments > $verify_log 2>&1 <<EOF
      $input_file
      $ssf_file
      $verify_env_out
      A
EOF

   # then 'verify_3d': again driven through stdin, output appended to the log
   $verify_env/verify_3d >> $verify_log 2>&1 <<EOF
      $verify_env_out
      $verify_env/3d_1d.tab
      $verify_plot
      21
      0
EOF

   arguments[arg_index]="-verify $verify_plot"
   arg_index=$((arg_index + 1))
fi

# WhatCheck
if test ${use_program[2]} = 0 ; then
   printf '%s\n' "*** Running WhatCheck ***"
   printf '%s\n' "*** Running WhatCheck ***" >>$error_collector
   # The empty here-string sends a single newline, i.e. 'Enter'
   # => WhatCheck will consider all conformers in the input file
   ${paths[2]} $input_file > $whatcheck_log 2>&1 <<< ""
   arguments[arg_index]="-what pdbout.txt"
   arg_index=$((arg_index + 1))
fi

# ProQ
if test ${use_program[3]} = 0 ; then
   printf '%s\n' "*** Running ProQ ***"
   printf '%s\n' "*** Running ProQ ***" >>$error_collector
   # the model and the file stored in 'psipred_file' go in;
   # everything ProQ prints (stdout and stderr) is collected in $proq_out
   ${paths[3]} -model $input_file -ss $psipred_file > $proq_out 2>&1
   arguments[arg_index]="-proq $proq_out"
   arg_index=$((arg_index + 1))
fi

# ProcheckNMR
if [ "${use_program[4]}" = 0 ] ; then
   echo "*** Running ProcheckNMR ***"
   echo "*** Running ProcheckNMR ***" >>"$error_collector"
   # stdout is kept in $procheck_out; error messages are collected below the
   # banner in the error log, as for the other programs that log there
   ( "${paths[4]}" "$input_file" > "$procheck_out" 2>>"$error_collector" )
   arguments[$((arg_index++))]="-procheck tmp.sum"
fi

# MolProbity
if [ "${use_program[5]}" = 0 ] ; then
   echo "*** Running MolProbity ***"
   echo "*** Running MolProbity ***" >>"$error_collector"

   # ${paths[5]} is currently the path to one of the scripts in the directory 'cmdline'. We remove the script
   # name to obtain the path to 'cmdline', and use this path in the invocation of the two scripts we need.
   molprobity_cmdline=$(dirname "${paths[5]}")
   # One level further up is the directory that contains 'lib/chiropraxis.jar'.
   molprobity_root=$(dirname "$molprobity_cmdline")

   # 'oneline-analysis' operates on directories, thus we have to provide it with one.
   # The directory gets a unique name (created in the current directory), so that an
   # already existing 'tmp_dir' is neither analysed by accident nor deleted afterwards.
   pdb_dir=$(mktemp -d "tmp_dir.XXXXXX" 2>>"$error_collector")
   if [ -n "$pdb_dir" ] && cp "$input_file" "$pdb_dir" 2>>"$error_collector" ; then
      ( "$molprobity_cmdline/oneline-analysis" "$pdb_dir" > "$molprobity_out" 2>>"$error_collector" )
   else
      echo "could not set up a working directory for oneline-analysis" >>"$error_collector"
      : > "$molprobity_out"   # start from an empty file, as the '>' above would have done
   fi
   if [ -n "$pdb_dir" ] ; then
      rm -rf -- "$pdb_dir"
   fi

   # 'residue-analysis' operates on files
   ( "$molprobity_cmdline/residue-analysis" "$input_file" >> "$molprobity_out" 2>>"$error_collector" )

   # this program will produce the Ramachandran plot .pdf file
   ( java -Xmx256m -cp "$molprobity_root/lib/chiropraxis.jar" chiropraxis.rotarama.Ramalyze -pdf "$input_file" \
        "$molprobity_rama_pdf" 2>>"$error_collector" )

   # as before, paths[5] ends up pointing two levels above the original script
   paths[5]=$molprobity_root

   # note that there is NO space after the comma!
   arguments[$((arg_index++))]="-molprobity $molprobity_out,$molprobity_rama_pdf"
fi

# PDB Validation Suite
if test ${use_program[6]} = 0 ; then
   printf '%s\n' "*** Running the PDB Validation Suite ***"
   printf '%s\n' "*** Running the PDB Validation Suite ***" >>$error_collector

   # A '/' in the path to the program is taken to mean that 'validation-v8'
   # is not in a directory on PATH; in that case the environment variables
   # are set here. RCSBROOT is the directory two levels above the program.
   case ${paths[6]} in
      */*)
         RCSBROOT=$(dirname ${paths[6]})
         RCSBROOT=$(dirname $RCSBROOT)
         export RCSBROOT
         export PATH="$RCSBROOT/bin:$PATH"
         ;;
   esac

   ( ${paths[6]} -f $input_file -o 0 -adit 2>>$error_collector )

   # give the result files names that start with $pdb
   mv TMP.letter $pdb.out 2>>$error_collector
   for suffix in alignment err
   do
      mv validation.$suffix $pdb.$suffix 2>>$error_collector
   done

   # note that there is NO space after the comma!
   arguments[$((arg_index++))]="-pdb $pdb.out,$pdb.err"
fi

# RPF
if [ "${use_program[7]}" = 0 ] ; then
   echo "*** Contacting the RPF Web Server ***"
   echo "*** Contacting the RPF Web Server ***" >>"$error_collector"

   # The file name has to be quoted: with an empty 'rpf_cmd_file' an unquoted
   # test collapses to '[ ! -f ]', which is false, and the error would never
   # be reported.
   if [ ! -f "$rpf_cmd_file" ] ; then
      echo " *** Error: There was no RPF information file! ***"
   else
      rpf_value=$( "${paths[7]}" -i "$rpf_cmd_file" 2>>"$error_collector" )
      # A usable answer contains at least one digit or dot; the matching output
      # is recorded in the log. The pattern is quoted so that the shell cannot
      # expand it as a file name pattern.
      if ! printf '%s\n' "$rpf_value" | grep -E '[0-9.]+' >>"$error_collector" ; then
         rpf_value='error'
      fi
      arguments[$((arg_index++))]="-rpf $rpf_value"
   fi
fi

# The name of the overview file is the last option to be added,
# so 'arg_index' does not have to be incremented any more.
arguments[$arg_index]="-out $overview_file"

# This script will process the results. The array is expanded unquoted on
# purpose: every entry holds an option and its value, which have to arrive
# as two separate words.
$CYANALIB/macro/validation_output.pl ${arguments[@]} 2>>$error_collector

# *** Tidy up ***
clean_up
