# -*- coding: utf-8 -*-
"""Small module to read and write .fa.fai files.
A .fa.fai file contains a line for each chromosome in the corresponding
fasta file, each consisting of 5 tab separated columns:
1. name of the chromosome
2. length of the chromosome in bytes
3. starting position of the chromosome in the fasta file (in bytes)
4. length of a line in the FASTA file (in characters)
5. length of a line in the FASTA file (in bytes); this includes the trailing \\n
"""
from math import ceil
import os
[docs]
def read_fai(path):
    """Read and parse a .fa.fai (FASTA index) file.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being turned into malformed
    one-element entries.

    Arguments:
        path (str): Path to a .fa.fai file.

    Returns:
        list: One entry per index line, in file order. Each entry is a list
        ``[name, length, start, line_length, line_length_bytes]`` as produced
        by ``_parse``: the name as bytes, every remaining column as int.

    Raises:
        ValueError: If a numeric column cannot be converted to int.
    """
    with open(path, "rt") as fai_file:
        return [
            _parse(line.rstrip().split("\t"))
            for line in fai_file
            # A line without any content has no columns to parse.
            if line.strip()
        ]
def _parse(line_items):
"""Parse each token of a fai line to the correct type.
The expected tokens are:
- name (str)
- length (int)
- start index (int)
- line length in characters (int)
- line length in bytes (int)
Arguments:
line_items (list): A list of tokens from a fai entry, as generated by
`line.strip("\n").split("\t")) for line in fai_file`, as strings.
Returns:
list: Fai entry with correctly typed items.
"""
return [line_items[0].encode()] + list(map(int, line_items[1:]))
[docs]
def write_fai(target, fai_entries):
    """Write the given fai entries to a file path or an open text stream.

    Arguments:
        target (str, os.PathLike or file-like): Where the fai data is written.
            A ``str`` or path-like object is opened (and truncated) as a text
            file and closed afterwards. Any other object is treated as an
            already open text stream: only its ``write`` method is used and
            it is not closed here.
        fai_entries (iterable): Each item must unpack into exactly five
            values: (name, length, start index, line length (in characters),
            line length (in bytes)). The name may be ``str`` or ``bytes``;
            bytes names are decoded as ASCII.

    Raises:
        ValueError: If an entry does not consist of exactly five items.
        UnicodeDecodeError: If a bytes name contains non-ASCII data.
    """
    if isinstance(target, (str, os.PathLike)):
        with open(target, 'wt') as fai_file:
            _write_fai_lines(fai_file, fai_entries)
    else:
        _write_fai_lines(target, fai_entries)


def _write_fai_lines(fai_file, fai_entries):
    """Serialize each fai entry as one tab-separated line to ``fai_file``."""
    for chr_name, chr_len, chr_start, line_length, line_bytes in fai_entries:
        if isinstance(chr_name, bytes):
            # Names read from FASTA files are bytes; the index is a text file.
            chr_name = chr_name.decode('ascii')
        columns = [chr_name, str(chr_len), str(chr_start), str(line_length), str(line_bytes)]
        fai_file.write("\t".join(columns) + "\n")
[docs]
def write_chromosomes_as_fai(path, chromosomes, line_length):
    """Write a fai file from a given chromosome list.

    The chromosome list is converted with ``chromosome_info_to_fai`` and the
    resulting entries are serialized by ``write_fai``. That conversion yields
    chromosome names as bytes; ``write_fai`` decodes them, so the file
    contains ``chr1`` rather than the bytes repr ``b'chr1'``.

    Arguments:
        path (str): Path where the fai-file will be written to.
        chromosomes (list): Each item should contain information about one
            chromosome in dinopy format (name, length, (start, stop)).
        line_length (int): Length of the lines (in characters) in the FASTA
            file. The line length in bytes is derived from it by
            ``chromosome_info_to_fai``.
    """
    fai = chromosome_info_to_fai(chromosomes, line_length=line_length)
    with open(path, 'wt') as fai_file:
        # Pass the open handle so that opening the path stays in this function.
        write_fai(fai_file, fai)
[docs]
def fai_entry_to_chromosome_info_entry(fai_entry):
    """Convert a single fai entry to dinopy chromosome info format.

    Maps :code:`[chr_name, chr_len, chr_start, line_length, line_length_bytes]`
    (file-view) to :code:`[chr_name, chr_len, (chr_start, chr_stop)]`
    (genome-array-view, i.e. without names, newlines, and '>').

    Only the size of this entry's own name line is subtracted from the file
    offset to obtain the start of the interval.

    Arguments:
        fai_entry (list): A fai entry of exactly five items
            :code:`(chr_name, chr_len, chr_start, line_length, line_length_bytes)`.

    Returns:
        list: :code:`[name as bytes, length, (start, stop)]`.
    """
    name, length, file_start, _line_chars, _line_bytes = fai_entry
    if isinstance(name, str):
        name = name.encode()
    # The name line consists of '>', the name itself and a newline.
    header_size = len(name) + 2
    array_start = file_start - header_size
    return [name, length, (array_start, array_start + length)]
[docs]
def fai_to_chromosome_info(fai_entries):
    """Convert the given fai entries to dinopy chromosome info format.

    Converts from :code:`[chr_name, chr_len, chr_start, line_length, line_length_bytes]`
    entries (file-view) to :code:`[chr_name, chr_len, (chr_start, chr_stop)]`
    entries (genome-array-view, i.e. without names, newlines, and '>').

    The intervals are laid out back to back in input order: the first
    chromosome starts at 0 and every following one starts where its
    predecessor stops. The file offsets stored in the fai entries are not
    used for this.

    Arguments:
        fai_entries (Iterable): An iterable of valid fai entries. It is
            traversed exactly once, so generators are accepted as well.

    Returns:
        list: One chromosome info entry per fai entry, with the name as
        bytes. An empty input yields an empty list.
    """
    chromosome_infos = []
    chr_start_index = 0
    for chr_name, chr_len, *_ in fai_entries:
        if isinstance(chr_name, str):
            chr_name = chr_name.encode()
        chr_end_index = chr_start_index + chr_len
        chromosome_infos.append([chr_name, chr_len, (chr_start_index, chr_end_index)])
        # The next chromosome begins directly after this one.
        chr_start_index = chr_end_index
    return chromosome_infos
[docs]
def chromosome_info_to_fai(chr_info, line_length=80):
    """Convert dinopy chromosome info to fai entries.

    Arguments:
        chr_info (list): Chromosome info entries in the format
            :code:`chr_name, chr_length, (chr_start, chr_stop)`. Names may be
            ``str`` or ``bytes``.
        line_length (int): Line length (in characters) in the FASTA file.

    Returns:
        list: One fai entry
        :code:`[name as bytes, length, start offset, line_length, line_length + 1]`
        per chromosome. The start offset is the chromosome's interval start
        plus the size of all name lines and sequence newlines written before
        its first base.

    Raises:
        ZeroDivisionError: If ``line_length`` is 0.
    """
    fai = []
    # Running count of all bytes that are not sequence characters:
    # name lines and the newline at the end of every sequence line.
    non_sequence_bytes = 0
    for chr_name, chr_length, (chr_start, _chr_stop) in chr_info:
        # Encode before measuring: the offsets are byte positions, and a str
        # name with multi-byte characters is longer in bytes than in characters.
        if isinstance(chr_name, str):
            chr_name = chr_name.encode()
        non_sequence_bytes += len(chr_name) + 1 + 1  # len(chr_name) + len('>') + len(newline)
        chr_start_bytes = chr_start + non_sequence_bytes
        fai.append([chr_name, chr_length, chr_start_bytes, line_length, line_length + 1])
        # One newline byte for every (possibly partial) sequence line.
        non_sequence_bytes += ceil(chr_length / line_length)
    return fai
[docs]
def is_valid_fai_entry(fai_entry):
    """Tell whether the given object has the shape of a fai entry.

    Arguments:
        fai_entry (collection): Candidate that is checked against the
            prerequisites of a fai entry.

    Returns:
        bool: True if the entry is valid, False if not.

    Note:
        A valid fai entry consists of exactly five items:

        1. name of the chromosome (str or bytes)
        2. length of the chromosome in bytes (int)
        3. starting position of the chromosome in the FASTA file, in bytes (int)
        4. length of a line in the FASTA file, in characters (int)
        5. length of a line in the FASTA file, in bytes, including the
           trailing newline (int)
    """
    try:
        name, *numbers = fai_entry
    except (ValueError, TypeError):
        # ValueError: nothing to unpack; TypeError: entry is not iterable.
        return False
    if len(numbers) != 4:
        # More or fewer than five items in total.
        return False
    # The name may be str or bytes, every other item has to be an int.
    return isinstance(name, (str, bytes)) and all(isinstance(number, int) for number in numbers)
[docs]
def is_valid_fai(fai_entries):
    """Tell whether every item of the given collection is a valid fai entry.

    Arguments:
        fai_entries (list): Potential fai entries to validate.

    Returns:
        bool: True if all entries are valid (this includes an empty
        collection). False as soon as one entry is invalid or if the given
        object cannot be iterated.

    Note:
        Each entry is checked with ``is_valid_fai_entry``, i.e. it has to
        consist of:

        1. name of the chromosome
        2. length of the chromosome in bytes
        3. starting position of the chromosome in the FASTA file (in bytes)
        4. length of a line in the FASTA file (in characters)
        5. length of a line in the FASTA file (in bytes), including the
           trailing newline
    """
    try:
        # all() stops at the first invalid entry.
        return all(is_valid_fai_entry(candidate) for candidate in fai_entries)
    except TypeError:
        # Raised if the given object is not iterable -> invalid.
        return False