add_indels.py 5.8 KB
Newer Older
Carine Rey's avatar
add exe  
Carine Rey committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>

# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.

# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".

# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.

# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.

# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.


import argparse
import sys
import os, random
import logging

from Bio import AlignIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
# Command-line interface of the script.
parser = argparse.ArgumentParser(prog="add_indels.py",
                                 description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
                    help="debug mode",
                    default=False)
# These two flags are not mandatory, so they are registered on the parser
# itself instead of being listed under the "REQUIRED OPTIONS" help section.
parser.add_argument('-c', '--codon', action="store_true",
                    help="is codon data")
parser.add_argument('-r', '--random_seed', type=int,
                    help="random seed", required=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
# FileType('r') makes argparse open the alignment file while parsing.
requiredOptions.add_argument('-a', "--ali", type=argparse.FileType('r'),
                             help='Alignment filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
                             help="Output name", required=True)
requiredOptions.add_argument('-p', '--indel_p', type=float,
                             help="indel proportion", required=True)
Carine Rey's avatar
add exe  
Carine Rey committed
62 63 64 65 66 67 68 69 70 71
##############


### Option parsing
args = parser.parse_args()

# Module-level settings read by the rest of the script.
AliFile = args.ali
OutFile = args.output
IndelP = args.indel_p
IsCodon = args.codon

# Compare against None (not truthiness) so that an explicit "-r 0" is
# honoured instead of being silently replaced by a random seed.
if args.random_seed is not None:
    rseed = args.random_seed
else:
    rseed = random.randint(0, sys.maxsize)
random.seed(rseed)
Carine Rey's avatar
add exe  
Carine Rey committed
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93

#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("add_indels")
# The logger itself lets every record through; the console handler below
# decides what is actually shown.
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
# --debug lowers the console threshold from INFO to DEBUG.
ch.setLevel(logging.DEBUG if args.debug else logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)

logger.debug(sys.argv)

# Report the seed so that a run can be reproduced with -r.
logger.info("Random seed is %s", rseed)

Carine Rey's avatar
add exe  
Carine Rey committed
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148

#===================================================================================================
# Read input alignment
#===================================================================================================
try:
    # AliFile is the handle opened by argparse (FileType); the "with" block
    # closes it as soon as the alignment has been parsed.
    with AliFile:
        ali = AlignIO.read(AliFile, "fasta")
except Exception as exc:
    # Script-level boundary: any failure to read the alignment is logged
    # and turned into a non-zero exit status.
    logger.error(str(exc))
    sys.exit(1)

logger.info("Ali (%s) ok after checking", AliFile.name)


#===================================================================================================
# Add indels in the ali
#===================================================================================================
def add_indels(string, p, is_codon=None):
    """Return *string* with a proportion *p* of its positions replaced by "-".

    Positions are drawn without replacement with ``random.sample``, so the
    result depends on the state of the global ``random`` generator.

    Args:
        string: the sequence to modify; anything whose ``str()`` is the
            sequence text (a ``str`` or a Biopython ``Seq``).
        p: proportion of units to replace. Values <= 0 leave the sequence
            unchanged; values >= 1 replace every unit.
        is_codon: when true, whole codons (3 consecutive characters starting
            at a multiple of 3) are replaced and a trailing incomplete codon
            is never touched; when false, single characters are replaced.
            ``None`` (default) falls back to the module-level ``IsCodon``.

    Returns:
        The modified sequence as a ``str`` of the same length as the input.
    """
    if is_codon is None:
        is_codon = IsCodon

    if p <= 0:
        return str(string)

    # A "unit" is what gets replaced as a whole: one codon or one character.
    unit_len = 3 if is_codon else 1
    chars = list(str(string))
    nb_units = len(chars) // unit_len
    # int() truncates, and the min() caps proportions above 1.
    nb_gapped = min(int(nb_units * p), nb_units)
    gapped_units = random.sample(range(nb_units), nb_gapped)

    # Diagnostic output goes to the script's logger instead of stdout.
    logging.getLogger("add_indels").debug(
        "units: %s, gapped units: %s, gapped unit indices: %s",
        nb_units, nb_gapped, gapped_units)

    for unit_index in gapped_units:
        start = unit_index * unit_len
        chars[start:start + unit_len] = "-" * unit_len
    return "".join(chars)


# Build one gapped copy of every record of the input alignment; the record
# id is kept and the description is set to an empty string.
new_ali = [
    SeqRecord(Seq(add_indels(record.seq, IndelP), generic_dna),
              id=record.id, description="")
    for record in ali
]

logger.info("Indel replacement ok")

#===================================================================================================
# Create output files
#===================================================================================================
gapped_alignment = MultipleSeqAlignment(new_ali)
AlignIO.write(gapped_alignment, OutFile, "fasta")