Source code for dinopy.fai_io

# -*- coding: utf-8 -*-
"""Small module to read and write .fa.fai files.

A .fa.fai file contains a line for each chromosome in the corresponding
fasta file, each consisting of 5 tab separated columns:

1. name of the chromosome
2. length of the chromosome in bytes
3. starting position of the chromosomes in the fasta file (in bytes)
4. length of a line in the FASTA file (in characters)
5. length of a line in the FASTA file (in bytes), including the trailing newline

"""
from math import ceil
import os


def read_fai(path):
    """Read and parse a .fa.fai (FASTA annotation index) file.

    Lines that are empty after stripping trailing whitespace (for example a
    trailing blank line) are skipped instead of being turned into bogus
    one-element entries.

    Arguments:
        path (str): Path to a .fa.fai file

    Returns:
        list: A list with one list per fai line, as produced by ``_parse``:
        ``[name, length, startpos (bytes), line_length, line_length_bytes]``
        where the name is a bytes object and all other items are ints.
    """
    with open(path, "rt") as fai_file:
        stripped_lines = (line.rstrip() for line in fai_file)
        return [_parse(line.split("\t")) for line in stripped_lines if line]
def _parse(line_items):
    """Convert the string tokens of one fai line to their proper types.

    The tokens are expected in this order:

    - name (str)
    - length (int)
    - start index (int)
    - line length in characters (int)
    - line length in bytes (int)

    Arguments:
        line_items (list): The tab-separated tokens of a single fai line,
            all given as strings.

    Returns:
        list: The fai entry with the name encoded to bytes and every
        following token converted to int.
    """
    typed_entry = [line_items[0].encode()]
    typed_entry.extend(int(token) for token in line_items[1:])
    return typed_entry
def _format_fai_line(fai_entry):
    """Render one fai entry as a tab-separated line ending in a newline."""
    chr_name, chr_len, chr_start, line_length, line_bytes = fai_entry
    if isinstance(chr_name, bytes):
        chr_name = chr_name.decode('ascii')
    fields = [
        chr_name,
        str(chr_len),
        str(chr_start),
        str(line_length),
        str(line_bytes),
    ]
    return "\t".join(fields) + "\n"


def _write_fai_lines(fai_file, fai_entries):
    """Write every fai entry as one line to an object with a write method."""
    for fai_entry in fai_entries:
        fai_file.write(_format_fai_line(fai_entry))


def write_fai(target, fai_entries):
    """Write specified fai to given target.

    Arguments:
        target (str or file-like): If a str, it is used as the path of the
            file that is opened (in text mode) and written to. Any other
            object is written to directly through its ``write`` method.
        fai_entries (list): Each item should contain all needed tokens for
            a valid fai-line: (name, length, start index, line length
            (in characters), line length (in bytes)). Names given as bytes
            are decoded as ASCII.
    """
    if isinstance(target, str):
        with open(target, 'wt') as fai_file:
            _write_fai_lines(fai_file, fai_entries)
    else:
        _write_fai_lines(target, fai_entries)
def write_chromosomes_as_fai(path, chromosomes, line_length):
    """Write a fai file from a given chromosome list.

    The line length in bytes is not a parameter; it is taken from the
    entries computed by ``chromosome_info_to_fai``.

    Arguments:
        path (str): Path where the fai-file will be written to.
        chromosomes (list): Each item should contain information about one
            chromosome in dinopy format (name, length, (start, stop)).
        line_length (int): length of the lines (in characters) in the
            FASTA file.
    """
    fai = chromosome_info_to_fai(chromosomes, line_length=line_length)
    with open(path, 'wt') as fai_file:
        for name, length, interval_start, entry_line_length, line_bytes in fai:
            # chromosome_info_to_fai encodes str names to bytes; formatting
            # a bytes object directly would write its repr (b'chr1') to the
            # file, so turn it back into text first.
            if isinstance(name, bytes):
                name = name.decode()
            fai_file.write("{}\t{}\t{}\t{}\t{}\n".format(
                name, length, interval_start, entry_line_length, line_bytes))
def fai_entry_to_chromosome_info_entry(fai_entry):
    """Translate a single fai entry into a dinopy chromosome info entry.

    The input is in file-view form
    :code:`[chr_name, chr_len, chr_start, line_length, line_length_bytes]`
    and the output is in genome-array-view form
    :code:`[chr_name, chr_len, (chr_start, chr_stop)]`, i.e. without names,
    newlines, and '>'.

    Arguments:
        fai_entry(list): A valid fai entry
            :code:`(chr_name, chr_len, chr_start, line_length, line_length_bytes)`

    Returns:
        list: A valid chromosome info entry; the name is returned as bytes.
    """
    name, length, file_offset, _, _ = fai_entry
    name_bytes = name.encode() if isinstance(name, str) else name
    # The name line consists of '>', the name itself and a newline.
    name_line_size = len(name_bytes) + 2
    array_start = file_offset - name_line_size
    return [name_bytes, length, (array_start, array_start + length)]
def fai_to_chromosome_info(fai_entries):
    """Convert the given fai-entries to dinopy chromosome info format.

    Converts from a list of
    :code:`[chr_name, chr_len, chr_start, line_length, line_length_bytes]`
    (file-view) to a list of :code:`[chr_name, chr_len, (chr_start, chr_stop)]`
    entries (genome-array-view i.e. without names, newlines, and '>').

    The chromosomes are laid out back to back starting at index 0; only the
    name and the length of each fai entry are used.

    Arguments:
        fai_entries(Iterable): An iterable of valid fai-entries. It may be
            empty and does not need to support indexing.

    Returns:
        list: A list containing valid chromosome info entries (empty if no
        fai entries were given). Names are returned as bytes.
    """
    chromosome_infos = []
    chr_start_index = 0
    for chr_name, chr_len, *_ in fai_entries:
        if isinstance(chr_name, str):
            chr_name = chr_name.encode()
        chr_end_index = chr_start_index + chr_len
        chromosome_infos.append([chr_name, chr_len, (chr_start_index, chr_end_index)])
        # The next chromosome starts right where this one ends.
        chr_start_index = chr_end_index
    return chromosome_infos
def chromosome_info_to_fai(chr_info, line_length=80):
    """Convert dinopy chromosome info to fai-lines.

    Arguments:
        chr_info(list): Containing chromosome info entries in the format:
            :code:`chr_name, chr_length, (chr_start, chr_stop)`
        line_length(int): Line length in the FASTA file.

    Returns:
        list: List containing a valid fai-entry
        :code:`[chr_name, chr_length, chr_start_bytes, line_length, line_length + 1]`
        for each chromosome. Names are returned as bytes.
    """
    fai = []
    # Number of bytes in the file so far that are not sequence characters:
    # name lines plus the newline at the end of every sequence line.
    extra_bytes = 0
    for chr_name, chr_length, (chr_start, _) in chr_info:
        # Encode before measuring so the name line is counted in bytes,
        # not in characters (they differ for non-ASCII names).
        if isinstance(chr_name, str):
            chr_name = chr_name.encode()
        extra_bytes += len(chr_name) + 1 + 1  # len(chr_name) + len('>') + len(newline)
        chr_start_bytes = chr_start + extra_bytes
        fai.append([chr_name, chr_length, chr_start_bytes, line_length, line_length + 1])
        # One additional newline byte for every sequence line of this chromosome.
        extra_bytes += ceil(chr_length / line_length)
    return fai
def is_valid_fai_entry(fai_entry):
    """Tell whether the given object has the shape of a fai entry.

    Arguments:
        fai_entry (collection): Candidate that is checked against the
            prerequisites of a fai-entry.

    Returns:
        bool: True if the entry is valid, False if not.

    Note:
        A valid fai entry consists of exactly five items:

        1. name of the chromosome (str or bytes)
        2. length of the chromosome in bytes (int)
        3. starting position of the chromosome in the FASTA file, in bytes (int)
        4. length of a line in the FASTA file, in characters (int)
        5. length of a line in the FASTA file, in bytes, including the
           trailing newline (int)
    """
    try:
        name, length, start, line_len_chars, line_len_bytes = fai_entry
    except (ValueError, TypeError):
        # ValueError: more or fewer than 5 items; TypeError: not iterable.
        return False

    if not isinstance(name, (str, bytes)):
        return False
    numeric_fields = (length, start, line_len_chars, line_len_bytes)
    return all(isinstance(field, int) for field in numeric_fields)
def is_valid_fai(fai_entries):
    """Tell whether every item of the given collection is a valid fai entry.

    Arguments:
        fai_entries (list): Candidate fai entries to validate.

    Returns:
        bool: True if all entries in the list are valid. False if not,
        or if the given object cannot be iterated.

    Note:
        A valid fai entry consists of exactly five items:

        1. name of the chromosome
        2. length of the chromosome in bytes
        3. starting position of the chromosome in the FASTA file (in bytes)
        4. length of a line in the FASTA file (in characters)
        5. length of a line in the FASTA file (in bytes), including the
           trailing newline
    """
    try:
        # all() stops at the first invalid entry.
        return all(is_valid_fai_entry(entry) for entry in fai_entries)
    except TypeError:
        # Raised if the given object is not iterable -> invalid.
        return False