multitax
View Source
# Version string of the multitax package.
__version__ = "1.3.1"

# Public API: one class per supported taxonomy source.
__all__ = (
    'CustomTx',
    'DummyTx',
    'GreengenesTx',
    'GtdbTx',
    'NcbiTx',
    'OttTx',
    'SilvaTx',
)

# Each taxonomy class lives in its own submodule and is re-exported here.
from .customtx import CustomTx
from .dummytx import DummyTx
from .greengenestx import GreengenesTx
from .gtdbtx import GtdbTx
from .ncbitx import NcbiTx
from .otttx import OttTx
from .silvatx import SilvaTx
View Source
class CustomTx(MultiTax):
    """
    Taxonomy built from user-provided delimited text files (one node per line).
    """

    # Columns that must be present and the complete set of recognised columns.
    _required_cols = ["node", "parent"]
    _possible_cols = ["node", "parent", "rank", "name"]

    def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs):
        r"""
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
        * **sep** *[str]*: Separator of fields
        * **\*\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """
        # The default list is shared between calls but only ever read:
        # _parse_cols() derives a new dict from a list argument.
        # Both attributes are set before delegating to the base constructor
        # because _parse() reads them (presumably invoked from there).
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'CustomTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """Translation is not available for custom taxonomies: warn and return an empty dict."""
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every handle in *fhs* (dict of source -> iterable of lines).

        Lines may be str or bytes. Returns three dicts keyed by node:
        (node -> parent, node -> rank, node -> name). Rank and name are only
        filled when the respective column was configured.
        """
        nodes = {}
        ranks = {}
        names = {}
        # Hoist the per-line invariants out of the loop.
        node_col = self._cols["node"]
        parent_col = self._cols["parent"]
        has_name = "name" in self._cols
        has_rank = "rank" in self._cols
        for fh in fhs.values():
            for line in fh:
                # Handles opened in binary mode yield bytes; decode explicitly
                # instead of relying on a catch-all exception handler, so that
                # malformed lines surface with their real error.
                if isinstance(line, bytes):
                    line = line.decode()
                # Strip only the line terminator: a plain rstrip() would also
                # remove trailing separators and drop empty trailing columns.
                fields = line.rstrip("\r\n").split(self._sep)
                node = fields[node_col]
                nodes[node] = fields[parent_col]
                if has_name:
                    names[node] = fields[self._cols["name"]]
                if has_rank:
                    ranks[node] = fields[self._cols["rank"]]
        return nodes, ranks, names

    def _parse_cols(self, cols):
        """
        Normalise *cols* into a {field: column index} dict and validate it.

        A list is converted using the position of each field as its index;
        any other value is used as given. Raises ValueError when a required
        column is missing or an unknown column is requested.
        """
        if isinstance(cols, list):
            cols = {c: i for i, c in enumerate(cols)}

        for rc in self._required_cols:
            if rc not in cols:
                raise ValueError(rc + " is a required column")

        for c in cols:
            if c not in self._possible_cols:
                raise ValueError(
                    c + " is not a valid column: " + ",".join(self._possible_cols))

        return cols
View Source
def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs):
    r"""
    CustomTx()

    Parameters:
    * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
    * **sep** *[str]*: Separator of fields
    * **\*\*kwargs** defined at `multitax.multitax.MultiTax`

    Example:

        tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
        tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
    """
    # The docstring is raw so that "\*" is not treated as an (invalid) escape
    # sequence; its text is unchanged.
    # Column layout and separator are stored before delegating to the base
    # constructor.
    self._cols = self._parse_cols(cols)
    self._sep = sep
    super().__init__(**kwargs)
CustomTx()
Parameters:
- cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
- sep [str]: Separator of fields
- **kwargs defined at
multitax.multitax.MultiTax
Example:
tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
View Source
class DummyTx(MultiTax):
    """
    Placeholder taxonomy that defines no parser or data source of its own.
    """

    def __init__(self, **kwargs):
        r"""
        DummyTx() - Dummy empty taxonomy

        Parameters:
        * \*\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # Raw docstring: "\*" is not a valid escape sequence in a normal
        # string literal and triggers a warning at compile time.
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'DummyTx({})'.format(', '.join(stats))
View Source
def __init__(self, **kwargs):
    r"""
    DummyTx() - Dummy empty taxonomy

    Parameters:
    * \*\*kwargs defined at `multitax.multitax.MultiTax`
    """
    # Raw docstring: "\*" is not a valid escape sequence in a normal string
    # literal and triggers a warning at compile time. The text is unchanged.
    super().__init__(**kwargs)
View Source
class GreengenesTx(MultiTax):
    """
    Taxonomy parsed from Greengenes lineage files
    (one "<id><TAB><k__...; p__...; ...>" entry per line).
    """

    _default_urls = [
        "https://gg-sg-web.s3-us-west-2.amazonaws.com/downloads/greengenes_database/gg_13_5/gg_13_5_taxonomy.txt.gz"]
    # Expected prefix and rank name for each position of a lineage.
    _rank_codes = [("k__", "kingdom"),
                   ("p__", "phylum"),
                   ("c__", "class"),
                   ("o__", "order"),
                   ("f__", "family"),
                   ("g__", "genus"),
                   ("s__", "species")]

    def __init__(self, **kwargs):
        # forwards.tsv
        # NOTE(review): nothing in this class reads or fills _forwards;
        # it is kept because other code may still access the attribute.
        self._forwards = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'GreengenesTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """Translation is not available for Greengenes: warn and return an empty dict."""
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every handle in *fhs* (dict of source -> iterable of lines).

        Lines may be str or bytes. The prefixed lineage entry itself
        (e.g. "c__Deinococci") is used as node identifier. Returns three
        dicts keyed by node: (node -> parent, node -> rank, node -> name).
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Binary handles yield bytes; decode explicitly rather than
                # through a catch-all handler, which masked malformed lines
                # behind an unrelated AttributeError.
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split('\t')
                lin = lineage.split("; ")
                # Walk from the most specific entry up to the first one.
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0], \
                        "unexpected rank prefix in lineage entry: " + lin[i]
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    if not name:
                        continue  # empty entry "s__"
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i - 1]
                    # First occurrence of a node wins.
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names
View Source
def __init__(self, **kwargs):
    # forwards.tsv
    # Initialise the (empty) forwards mapping before delegating to the base
    # constructor with the unchanged keyword arguments.
    # NOTE(review): no Greengenes code shown on this page fills or reads
    # _forwards - confirm whether it is still needed.
    self._forwards = {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
View Source
class GtdbTx(MultiTax):
    """
    Taxonomy parsed from GTDB lineage files
    (one "<accession><TAB><d__...;p__...;...>" entry per line).
    """

    _default_urls = ["https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz",
                     "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz"]
    # Expected prefix and rank name for each position of a lineage.
    _rank_codes = [("d__", "domain"),
                   ("p__", "phylum"),
                   ("c__", "class"),
                   ("o__", "order"),
                   ("f__", "family"),
                   ("g__", "genus"),
                   ("s__", "species")]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'GtdbTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Build a {gtdb node: set(ncbi nodes)} dict from GTDB metadata archives.

        Only implemented for a target of class NcbiTx; any other target emits
        a warning and yields an empty dict. Metadata is read from *files*
        when given, otherwise downloaded from *urls* (or the default GTDB
        metadata urls).
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "NcbiTx":
            warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                          "," + target_tax.__class__.__name__ + "] not yet implemented.")
            return translated_nodes

        if files:
            fhs = open_files(files)
        else:
            _urls = ["https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz",
                     "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz"]
            fhs = download_files(
                urls=urls if urls else _urls, retry_attempts=3)

        ncbi_ranks = ("superkingdom", "phylum", "class",
                      "order", "family", "genus", "species")
        gtdb_ranks = ("domain", "phylum", "class",
                      "order", "family", "genus", "species")

        for fh in fhs.values():
            for member in fh.getmembers():
                ext = fh.extractfile(member)
                # extractfile() returns None for members that are not
                # regular files (e.g. directories): nothing to read.
                if ext is None:
                    continue
                with ext:
                    # Historical column positions, used unless the header
                    # line of this file says otherwise:
                    # 0 accession
                    # 16 gtdb_taxonomy
                    # 77 ncbi_taxid
                    # 78 ncbi_taxonomy
                    # 79 ncbi_taxonomy_unfiltered
                    gtdb_col = 16
                    ncbi_col = 77
                    for line in ext:
                        if isinstance(line, bytes):
                            line = line.decode()
                        fields = line.rstrip().split('\t')

                        # Header: skip it, but use it to locate the columns
                        # by name so a changed layout does not silently
                        # read the wrong fields.
                        if fields[0] == "accession":
                            if "gtdb_taxonomy" in fields:
                                gtdb_col = fields.index("gtdb_taxonomy")
                            if "ncbi_taxid" in fields:
                                ncbi_col = fields.index("ncbi_taxid")
                            continue

                        # Create lineage from leaf node based on target tax
                        # (instead of using the ncbi_taxonomy field) to
                        # accommodate changes in newest versions of the NCBI tax
                        ncbi_leaf_node = target_tax.latest(fields[ncbi_col])
                        if ncbi_leaf_node == target_tax.undefined_node:
                            continue
                        ncbi_nodes = target_tax.lineage(
                            ncbi_leaf_node, ranks=list(ncbi_ranks))

                        # Build GTDB lineage from leaf (species on given lineage)
                        # to accommodate possible changes in the loaded tax
                        gtdb_leaf_node = fields[gtdb_col].split(";")[-1]
                        if gtdb_leaf_node == self.undefined_node:
                            continue
                        gtdb_nodes = self.lineage(
                            gtdb_leaf_node, ranks=list(gtdb_ranks))

                        # Match ranks position by position, skipping ranks
                        # undefined on either side.
                        for gtdb_n, ncbi_n in zip(gtdb_nodes, ncbi_nodes):
                            if ncbi_n != target_tax.undefined_node and gtdb_n != self.undefined_node:
                                translated_nodes.setdefault(
                                    gtdb_n, set()).add(ncbi_n)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse every handle in *fhs* (dict of source -> iterable of lines).

        Lines may be str or bytes. The prefixed lineage entry itself
        (e.g. "c__Deinococci") is used as node identifier. Returns three
        dicts keyed by node: (node -> parent, node -> rank, node -> name).
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Binary handles yield bytes; decode explicitly rather than
                # through a catch-all handler, which masked malformed lines
                # behind an unrelated AttributeError.
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split('\t')
                lin = lineage.split(";")
                # Walk from the most specific entry up to the first one.
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0], \
                        "unexpected rank prefix in lineage entry: " + lin[i]
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    # empty entry "s__"
                    if not name:
                        continue
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i - 1]
                    # First occurrence of a node wins.
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names
View Source
def __init__(self, **kwargs):
    # Pure delegation: every keyword argument is forwarded unchanged to the
    # base constructor. No docstring is defined here, so the rendered
    # documentation shows the base constructor's text.
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
View Source
class NcbiTx(MultiTax):
    """
    Taxonomy parsed from the NCBI taxdump (nodes.dmp, names.dmp, merged.dmp),
    given either as one .tar.gz archive or as separate files.
    """

    _default_urls = ["ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"]

    def __init__(self, **kwargs):
        # Set before delegating to the base constructor: _parse() assigns
        # _merged and _parse_names() fills _extended_name_nodes (presumably
        # both run from the base constructor - an assignment afterwards
        # would wipe them).
        self._merged = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'NcbiTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Build a {ncbi node: set(gtdb nodes)} dict from GTDB metadata archives.

        Only implemented for a target of class GtdbTx; any other target emits
        a warning and yields an empty dict. Metadata is read from *files*
        when given, otherwise downloaded from *urls* (or the default GTDB
        metadata urls).
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "GtdbTx":
            warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                          "," + target_tax.__class__.__name__ + "] not yet implemented.")
            return translated_nodes

        if files:
            fhs = open_files(files)
        else:
            _urls = ["https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz",
                     "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz"]
            fhs = download_files(
                urls=urls if urls else _urls, retry_attempts=3)

        ncbi_ranks = ("superkingdom", "phylum", "class",
                      "order", "family", "genus", "species")
        gtdb_ranks = ("domain", "phylum", "class",
                      "order", "family", "genus", "species")

        for fh in fhs.values():
            for member in fh.getmembers():
                ext = fh.extractfile(member)
                # extractfile() returns None for members that are not
                # regular files (e.g. directories): nothing to read.
                if ext is None:
                    continue
                with ext:
                    # Historical column positions, used unless the header
                    # line of this file says otherwise:
                    # 0 accession
                    # 16 gtdb_taxonomy
                    # 77 ncbi_taxid
                    # 78 ncbi_taxonomy
                    # 79 ncbi_taxonomy_unfiltered
                    gtdb_col = 16
                    ncbi_col = 77
                    for line in ext:
                        if isinstance(line, bytes):
                            line = line.decode()
                        fields = line.rstrip().split('\t')

                        # Header: skip it, but use it to locate the columns
                        # by name so a changed layout does not silently
                        # read the wrong fields.
                        if fields[0] == "accession":
                            if "gtdb_taxonomy" in fields:
                                gtdb_col = fields.index("gtdb_taxonomy")
                            if "ncbi_taxid" in fields:
                                ncbi_col = fields.index("ncbi_taxid")
                            continue

                        # Build GTDB lineage from leaf (species on given lineage)
                        # to accommodate possible changes in the loaded tax
                        # NOTE(review): the leaf comes straight from the file
                        # and is not checked against target_tax before it is
                        # added below - confirm this is intended.
                        gtdb_leaf_node = fields[gtdb_col].split(";")[-1]
                        if gtdb_leaf_node == target_tax.undefined_node:
                            continue
                        gtdb_nodes = target_tax.lineage(
                            gtdb_leaf_node, ranks=list(gtdb_ranks))

                        # Build NCBI lineage from leaf
                        ncbi_leaf_node = self.latest(fields[ncbi_col])
                        if ncbi_leaf_node == self.undefined_node:
                            continue
                        # Additional connection from leaf to species on GTDB
                        # that could represent strain, etc on NCBI tax
                        translated_nodes.setdefault(
                            ncbi_leaf_node, set()).add(gtdb_leaf_node)
                        ncbi_nodes = self.lineage(
                            ncbi_leaf_node, ranks=list(ncbi_ranks))

                        # Match ranks position by position, skipping ranks
                        # undefined on either side.
                        for ncbi_n, gtdb_n in zip(ncbi_nodes, gtdb_nodes):
                            if gtdb_n != target_tax.undefined_node and ncbi_n != self.undefined_node:
                                translated_nodes.setdefault(
                                    ncbi_n, set()).add(gtdb_n)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse the handles in *fhs* (dict of source name -> handle).

        A single source ending in ".tar.gz" is read as a taxdump archive.
        Otherwise the handles are taken positionally: nodes.dmp, then
        optionally names.dmp and merged.dmp. Returns (nodes, ranks, names)
        and stores the merged entries on the instance.
        """
        fhs_list = list(fhs.values())
        extended_names = kwargs.get("extended_names", False)
        # One element tar.gz -> taxdump.tar.gz
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
            nodes, ranks, names, self._merged = self._parse_taxdump(
                fhs_list[0], extended_names=extended_names)
        else:
            # nodes.dmp
            nodes, ranks = self._parse_nodes(fhs_list[0])

            # [names.dmp]
            if len(fhs) >= 2:
                names = self._parse_names(
                    fhs_list[1], extended_names=extended_names)
            else:
                names = {}

            # [merged.dmp]
            if len(fhs) == 3:
                self._merged = self._parse_merged(fhs_list[2])
        return nodes, ranks, names

    def _parse_merged(self, fh):
        """Return {old taxid: new taxid} read from a merged.dmp handle (str or bytes lines)."""
        merged = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, _, new_taxid, _ = line.split('\t', 3)
            merged[old_taxid] = new_taxid
        return merged

    def _parse_names(self, fh, extended_names):
        """
        Return {node: scientific name} read from a names.dmp handle.

        With *extended_names* set, names of every other class are collected
        into self._extended_name_nodes as {name: [nodes]}.
        """
        names = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            node, name, _, name_class = line.split('\t|\t')
            # The last field still carries the row terminator.
            if name_class.replace('\t|\n', '') == "scientific name":
                names[node] = name
            elif extended_names:
                self._extended_name_nodes.setdefault(name, []).append(node)

        return names

    def _parse_nodes(self, fh):
        """Return ({node: parent}, {node: rank}) read from a nodes.dmp handle."""
        nodes = {}
        ranks = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, rank, _ = line.split('\t|\t', 3)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
        return nodes, ranks

    def _parse_taxdump(self, fh_taxdump, extended_names):
        """Parse nodes.dmp, names.dmp and merged.dmp out of an opened taxdump archive."""
        with fh_taxdump.extractfile('nodes.dmp') as fh_nodes:
            nodes, ranks = self._parse_nodes(fh_nodes)
        with fh_taxdump.extractfile('names.dmp') as fh_names:
            names = self._parse_names(fh_names, extended_names=extended_names)
        with fh_taxdump.extractfile('merged.dmp') as fh_merged:
            merged = self._parse_merged(fh_merged)
        return nodes, ranks, names, merged

    def latest(self, node: str):
        # Fall back to the merged.dmp entry when the base lookup does not
        # resolve the node.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.merged(node)
        return n

    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        if node in self._merged:
            return self._merged[node]
        else:
            return self.undefined_node

    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n
        else:
            if exact:
                ret = self._exact_name(text, self._extended_name_nodes)
            else:
                ret = self._partial_name(text, self._extended_name_nodes)

            # Only return nodes of chosen rank
            if rank:
                ret = filter_function(ret, self.rank, rank)

            # De-duplicate nodes found by both searches.
            return list(set(n + ret))

    def stats(self):
        # Extend the base statistics with the NCBI-specific counters,
        # reported only when non-empty.
        s = super().stats()
        if self._merged:
            s["merged"] = len(self._merged)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
View Source
def __init__(self, **kwargs):
    # Start with empty merged/extended-name tables, then delegate to the
    # base constructor with the unchanged keyword arguments.
    # NOTE(review): order presumably matters - parsing fills both tables
    # and appears to be driven by the base constructor; confirm before
    # reordering.
    self._merged = {}
    self._extended_name_nodes = {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
View Source
def latest(self, node: str):
    # Prefer the base-class answer; only when it is the undefined node is
    # the merged.dmp table consulted.
    resolved = super().latest(node)
    return self.merged(node) if resolved == self.undefined_node else resolved
Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
View Source
def merged(self, node: str):
    """
    Returns relative entry from the merged.dmp file of a given node.
    """
    # Nodes without a merged entry map to the undefined node.
    return self._merged.get(node, self.undefined_node)
Returns relative entry from the merged.dmp file of a given node.
View Source
def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search names defined as "scientific name" on names.dmp

    2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes
    """
    primary = super().search_name(text, rank=rank, exact=exact)
    # A hit on the primary names is final unless the caller asked for
    # every category.
    if primary and not force_extended:
        return primary

    lookup = self._exact_name if exact else self._partial_name
    extended = lookup(text, self._extended_name_nodes)

    # Only return nodes of chosen rank
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # De-duplicate nodes found by both searches.
    return list(set(primary + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search names defined as "scientific name" on names.dmp
2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
View Source
def stats(self):
    # Base statistics plus the NCBI-specific counters; a counter is only
    # reported when its table is non-empty.
    summary = super().stats()
    for key, table in (("merged", self._merged),
                       ("extended_names", self._extended_name_nodes)):
        if table:
            summary[key] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
View Source
class OttTx(MultiTax):
    """
    Taxonomy parsed from the Open Tree of Life taxonomy (taxonomy.tsv,
    forwards.tsv, synonyms.tsv), given either as one .tgz archive or as
    separate files.
    """

    _default_urls = ["http://files.opentreeoflife.org/ott/ott3.4/ott3.4.tgz"]
    _default_root_node = "805080"

    def __init__(self, **kwargs):
        # Set before delegating to the base constructor: parsing fills both
        # tables (presumably driven by the base constructor - an assignment
        # afterwards would wipe them).
        self._forwards = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'OttTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """Translation is not available for OTT: warn and return an empty dict."""
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the handles in *fhs* (dict of source name -> handle).

        A single source ending in ".tgz" is read as an OTT archive.
        Otherwise the handles are taken positionally: taxonomy.tsv, then
        optionally forwards.tsv and synonyms.tsv. Returns (nodes, ranks,
        names); forwards and synonyms are stored on the instance.
        """
        fhs_list = list(fhs.values())
        extended_names = kwargs.get("extended_names", False)
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
            nodes, ranks, names = self._parse_ott(
                fhs_list[0], extended_names=extended_names)
        else:
            # taxonomy.tsv
            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
            # [forwards.tsv]
            if len(fhs) >= 2:
                self._forwards = self._parse_forwards(fhs_list[1])
            # [synonyms.tsv]
            if len(fhs) == 3 and extended_names:
                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])

        return nodes, ranks, names

    def _parse_forwards(self, fh):
        """Return {old taxid: new taxid} read from a forwards.tsv handle (str or bytes lines)."""
        forwards = {}
        # skip first line header (tolerating an empty file)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, new_taxid = line.rstrip().split('\t')
            forwards[old_taxid] = new_taxid
        return forwards

    def _parse_ott(self, fh_taxdump, extended_names):
        """
        Parse taxonomy.tsv, forwards.tsv and (with *extended_names*)
        synonyms.tsv out of an opened OTT archive.

        Raises ValueError when a required file is not in the archive.
        """
        # Get files inside folder by name (the last matching member wins).
        members = {"taxonomy.tsv": None,
                   "forwards.tsv": None,
                   "synonyms.tsv": None}
        for e in fh_taxdump.getnames():
            for suffix in members:
                if e.endswith(suffix):
                    members[suffix] = e

        required = ["taxonomy.tsv", "forwards.tsv"]
        if extended_names:
            required.append("synonyms.tsv")
        missing = [suffix for suffix in required if members[suffix] is None]
        if missing:
            # Previously surfaced as an UnboundLocalError further down.
            raise ValueError(
                "file(s) not found in OTT archive: " + ",".join(missing))

        with fh_taxdump.extractfile(members["taxonomy.tsv"]) as fh_nodes:
            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
        with fh_taxdump.extractfile(members["forwards.tsv"]) as fh_forwards:
            self._forwards = self._parse_forwards(fh_forwards)
        if extended_names:
            with fh_taxdump.extractfile(members["synonyms.tsv"]) as fh_synonyms:
                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
        return nodes, ranks, names

    def _parse_synonyms(self, fh):
        """Return {name: [taxids]} read from a synonyms.tsv handle (str or bytes lines)."""
        synonyms = {}
        # skip first line header (tolerating an empty file)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            name, taxid, _ = line.split('\t|\t', 2)
            synonyms.setdefault(name, []).append(taxid)

        return synonyms

    def _parse_taxonomy(self, fh):
        """Return ({node: parent}, {node: rank}, {node: name}) read from a taxonomy.tsv handle."""
        nodes = {}
        ranks = {}
        names = {}
        # skip first line header (tolerating an empty file)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, name, rank, _ = line.split('\t|\t', 4)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
            names[taxid] = name
        return nodes, ranks, names

    def forwards(self, node: str):
        """
        Returns relative entry from the forwards.tsv file of a given node.
        """
        if node in self._forwards:
            return self._forwards[node]
        else:
            return self.undefined_node

    def latest(self, node: str):
        # Fall back to the forwards.tsv entry when the base lookup does not
        # resolve the node.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.forwards(node)
        return n

    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search default names defined on "taxonomy.tsv"

        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n
        else:
            if exact:
                ret = self._exact_name(text, self._extended_name_nodes)
            else:
                ret = self._partial_name(text, self._extended_name_nodes)

            # Only return nodes of chosen rank
            if rank:
                ret = filter_function(ret, self.rank, rank)

            # De-duplicate nodes found by both searches.
            return list(set(n + ret))

    def stats(self):
        # Extend the base statistics with the OTT-specific counters,
        # reported only when non-empty.
        s = super().stats()
        if self._forwards:
            s["forwards"] = len(self._forwards)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
View Source
def __init__(self, **kwargs):
    # Start with empty forwards/extended-name tables, then delegate to the
    # base constructor with the unchanged keyword arguments.
    # NOTE(review): order presumably matters - parsing fills both tables
    # and appears to be driven by the base constructor; confirm before
    # reordering.
    self._forwards = {}
    self._extended_name_nodes = {}
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
View Source
def forwards(self, node: str):
    """
    Returns relative entry from the forwards.tsv file of a given node.
    """
    # Nodes without a forwards entry map to the undefined node.
    return self._forwards.get(node, self.undefined_node)
Returns relative entry from the forwards.tsv file of a given node.
View Source
def latest(self, node: str):
    # Prefer the base-class answer; only when it is the undefined node is
    # the forwards.tsv table consulted.
    resolved = super().latest(node)
    return self.forwards(node) if resolved == self.undefined_node else resolved
Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
View Source
def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
    """
    Search node by exact or partial name.

    Default order (can be skipped with **force_extended=True**):

    1) Search default names defined on "taxonomy.tsv"

    2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

    Parameters:
    * **text** *[str]*: Text to search.
    * **rank** *[str]*: Filter results by rank.
    * **exact** *[bool]*: Exact or partial name search (both case sensitive).
    * **force_extended** *[bool]*: Search for text in all categories at once.

    Returns: list of matching nodes
    """
    primary = super().search_name(text, rank=rank, exact=exact)
    # A hit on the default names is final unless the caller asked for
    # every category.
    if primary and not force_extended:
        return primary

    lookup = self._exact_name if exact else self._partial_name
    extended = lookup(text, self._extended_name_nodes)

    # Only return nodes of chosen rank
    if rank:
        extended = filter_function(extended, self.rank, rank)

    # De-duplicate nodes found by both searches.
    return list(set(primary + extended))
Search node by exact or partial name.
Default order (can be skipped with force_extended=True):
1) Search default names defined on "taxonomy.tsv"
2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))
Parameters:
- text [str]: Text to search.
- rank [str]: Filter results by rank.
- exact [bool]: Exact or partial name search (both case sensitive).
- force_extended [bool]: Search for text in all categories at once.
Returns: list of matching nodes
View Source
def stats(self):
    # Base statistics plus the OTT-specific counters; a counter is only
    # reported when its table is non-empty.
    summary = super().stats()
    for key, table in (("forwards", self._forwards),
                       ("extended_names", self._extended_name_nodes)):
        if table:
            summary[key] = len(table)
    return summary
Returns a dict with general numbers of the taxonomic tree
Example:
from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()
pprint(tax.stats())
{'leaves': 30238,
'names': 42739,
'nodes': 42739,
'ranked_leaves': Counter({'species': 30238}),
'ranked_nodes': Counter({'species': 30238,
'genus': 8778,
'family': 2323,
'order': 930,
'class': 337,
'phylum': 131,
'domain': 1,
'root': 1}),
'ranks': 42739}
View Source
class SilvaTx(MultiTax):
    """
    Taxonomy parsed from SILVA taxonomy files
    (one "<name lineage;><TAB><taxid><TAB><rank><TAB>..." entry per line).
    """

    _default_urls = [
        "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'SilvaTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """Translation is not available for SILVA: warn and return an empty dict."""
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every handle in *fhs* (dict of source -> iterable of lines).

        Lines may be str or bytes. Parent links are derived from the name
        lineages: the parent of "A;B;C" is the node whose lineage is "A;B".
        Returns three dicts keyed by taxid:
        (node -> parent, node -> rank, node -> name).

        Raises KeyError when an ancestor lineage has no entry of its own.
        """
        nodes = {}
        ranks = {}
        names = {}

        # lineage (without trailing ";") -> taxid
        lin = {}
        for fh in fhs.values():
            for line in fh:
                # Binary handles yield bytes; decode explicitly rather than
                # through a catch-all handler, which masked malformed lines
                # behind an unrelated AttributeError.
                if isinstance(line, bytes):
                    line = line.decode()
                name_lineage, taxid, rank, _ = line.split('\t', 3)
                # Remove last char ";"
                lineage = name_lineage[:-1]
                name = lineage.split(";")[-1]
                # Save lineage to build tree
                lin[lineage] = taxid
                names[taxid] = name
                ranks[taxid] = rank

        # Build parent node connection
        for lineage, taxid in lin.items():
            current = taxid
            ancestors = lineage.split(";")[:-1]
            # Climb one level per iteration; links already set are kept.
            while ancestors:
                parent_taxid = lin[";".join(ancestors)]
                if current not in nodes:
                    nodes[current] = parent_taxid
                current = parent_taxid
                del ancestors[-1]  # remove last element
            # Connect last node to root
            if current not in nodes:
                nodes[current] = self._default_root_node

        return nodes, ranks, names
View Source
def __init__(self, **kwargs):
    # Pure delegation: every keyword argument is forwarded unchanged to the
    # base constructor. No docstring is defined here, so the rendered
    # documentation shows the base constructor's text.
    super().__init__(**kwargs)
Main constructor of MultiTax and sub-classes
Parameters:
- files [str, list]: One or more local files to parse.
- urls [str, list]: One or more urls to download and parse.
- output_prefix [str]: Directory to write downloaded files.
- root_node [str]: Define an alternative root node.
- root_parent [str]: Define the root parent node identifier.
- root_name [str]: Define an alternative root name. Set to None to use original name.
- root_rank [str]: Define an alternative root rank. Set to None to use original rank.
- undefined_node [str]: Define a default return value for undefined nodes.
- undefined_name [str]: Define a default return value for undefined names.
- undefined_rank [str]: Define a default return value for undefined ranks.
- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
- extended_names [bool]: Parse extended names if available.
Example:
tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")