multitax

View Source
# Version string of the multitax package.
__version__ = "1.3.1"

# Public API of the package: the taxonomy classes imported below from their
# respective submodules.
__all__ = (
    'CustomTx',
    'DummyTx',
    'GreengenesTx',
    'GtdbTx',
    'NcbiTx',
    'OttTx',
    'SilvaTx',
)

from .customtx import CustomTx
from .dummytx import DummyTx
from .greengenestx import GreengenesTx
from .gtdbtx import GtdbTx
from .ncbitx import NcbiTx
from .otttx import OttTx
from .silvatx import SilvaTx
View Source
class CustomTx(MultiTax):
    """
    Taxonomy built from user-supplied delimited text file(s).

    Each input line describes one node. Which column holds which field is
    configured with the `cols` argument of the constructor.
    """

    # Fields that must always be mapped to a column.
    _required_cols = ["node", "parent"]
    # Every field that may be mapped to a column.
    _possible_cols = ["node", "parent", "rank", "name"]

    # NOTE: the default list for `cols` is never mutated (`_parse_cols` builds a
    # new dict from it), so sharing it between calls is harmless here.
    def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs):
        r"""
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
        * **sep** *[str]*: Separator of fields
        * **\*\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """

        # Column layout and separator are stored before delegating to the parent
        # constructor (presumably the parent triggers _parse() - verify in MultiTax).
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'CustomTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation from a custom taxonomy is not implemented: emits a warning
        and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse every line of every handle in `fhs` using the configured columns.

        Parameters:
        * **fhs**: mapping whose values are iterables of lines (str or bytes).

        Returns: tuple (nodes, ranks, names) of dicts keyed by node, holding the
        parent, the rank and the name of each node respectively. `ranks` and
        `names` are only filled when the matching column was configured.
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                try:
                    fields = line.rstrip().split(self._sep)
                except TypeError:
                    # bytes line split with a str separator: decode and retry.
                    # Only TypeError is caught so that other problems (e.g. an
                    # empty separator) surface with their real error message.
                    fields = line.decode().rstrip().split(self._sep)

                node = fields[self._cols["node"]]
                nodes[node] = fields[self._cols["parent"]]
                if "name" in self._cols:
                    names[node] = fields[self._cols["name"]]
                if "rank" in self._cols:
                    ranks[node] = fields[self._cols["rank"]]

        return nodes, ranks, names

    def _parse_cols(self, cols):
        """
        Normalise `cols` into a {field: column index} dict and validate it.

        A list is converted using the position of each field as its column
        index; any other value is used as given.

        Raises: ValueError if a required field is missing or an unknown field
        is present.
        """
        if isinstance(cols, list):
            cols = {c: i for i, c in enumerate(cols)}

        for rc in self._required_cols:
            if rc not in cols:
                raise ValueError(rc + " is a required column")

        for c in cols:
            if c not in self._possible_cols:
                raise ValueError(c + " is not a valid column: " +
                                 ",".join(self._possible_cols))

        return cols
#   CustomTx( cols: list = ['node', 'parent', 'rank', 'name'], sep: str = '\t', **kwargs )
View Source
    # NOTE: the default list for `cols` is never mutated (`_parse_cols` builds a
    # new dict from it), so sharing it between calls is harmless here.
    def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs):
        r"""
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
        * **sep** *[str]*: Separator of fields
        * **\*\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """

        # The docstring is a raw string so that "\*" is not treated as an
        # (invalid) escape sequence; its text is unchanged.
        # Column layout and separator are stored before delegating to the parent
        # constructor (presumably the parent triggers parsing - verify in MultiTax).
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

CustomTx()

Parameters:

  • cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
  • sep [str]: Separator of fields
  • **kwargs defined at multitax.multitax.MultiTax

Example:

tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
View Source
class DummyTx(MultiTax):
    """
    Dummy empty taxonomy: defines no parsing of its own and relies entirely on
    the behaviour inherited from MultiTax.
    """

    def __init__(self, **kwargs):
        r"""
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \*\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # The docstring is a raw string so that "\*" is not treated as an
        # (invalid) escape sequence; its text is unchanged.
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'DummyTx({})'.format(', '.join(stats))
#   DummyTx(**kwargs)
View Source
    def __init__(self, **kwargs):
        r"""
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \*\*kwargs defined at `multitax.multitax.MultiTax`
        """
        # The docstring is a raw string so that "\*" is not treated as an
        # (invalid) escape sequence; its text is unchanged.
        super().__init__(**kwargs)

DummyTx() - Dummy empty taxonomy

Parameters:

  • **kwargs defined at multitax.multitax.MultiTax

#   class GreengenesTx(multitax.multitax.MultiTax):
View Source
class GreengenesTx(MultiTax):
    """
    Greengenes taxonomy, parsed from a two-column (identifier, lineage) file
    whose lineage entries are separated by "; " and prefixed with a rank code.
    """

    _default_urls = [
        "https://gg-sg-web.s3-us-west-2.amazonaws.com/downloads/greengenes_database/gg_13_5/gg_13_5_taxonomy.txt.gz"]
    # (prefix, rank) for each position of a lineage, from kingdom to species.
    _rank_codes = [("k__", "kingdom"),
                   ("p__", "phylum"),
                   ("c__", "class"),
                   ("o__", "order"),
                   ("f__", "family"),
                   ("g__", "genus"),
                   ("s__", "species")]

    def __init__(self, **kwargs):
        # forwards.tsv
        # NOTE(review): nothing in this class fills or reads _forwards - confirm
        # whether the parent class relies on it.
        self._forwards = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'GreengenesTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation from Greengenes is not implemented: emits a warning and
        returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse Greengenes taxonomy lines into node, rank and name dicts.

        Parameters:
        * **fhs**: mapping whose values are iterables of lines (str or bytes),
          each line being "<identifier>\\t<lineage>".

        Returns: tuple (nodes, ranks, names) of dicts keyed by the full lineage
        entry (e.g. "c__Deinococci"), holding its parent, rank and name.

        Raises: ValueError if a line does not have exactly two tab-separated
        fields; AssertionError if a lineage entry has an unexpected rank prefix.
        """
        nodes = {}
        ranks = {}
        names = {}

        for fh in fhs.values():
            for line in fh:
                # Binary handles yield bytes: decode once up front instead of
                # relying on a catch-all except, which hid malformed lines
                # behind an unrelated AttributeError.
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split('\t')
                lin = lineage.split("; ")
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0]
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    if not name:
                        continue  # empty entry "s__"
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i-1]
                    # The first occurrence of a node wins.
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names
#   GreengenesTx(**kwargs)
View Source
    def __init__(self, **kwargs):
        # forwards.tsv
        # NOTE(review): no Greengenes code visible here fills or reads
        # _forwards - confirm whether the parent class relies on it.
        self._forwards = {}
        # All constructor options are forwarded unchanged to the parent constructor.
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
View Source
class GtdbTx(MultiTax):
    """
    GTDB taxonomy, parsed from two-column (identifier, lineage) files whose
    lineage entries are separated by ";" and prefixed with a rank code.
    """

    _default_urls = ["https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz",
                     "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz"]
    # (prefix, rank) for each position of a lineage, from domain to species.
    _rank_codes = [("d__", "domain"),
                   ("p__", "phylum"),
                   ("c__", "class"),
                   ("o__", "order"),
                   ("f__", "family"),
                   ("g__", "genus"),
                   ("s__", "species")]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'GtdbTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Build a GTDB -> NCBI translation from the GTDB metadata archives.

        Parameters:
        * **target_tax**: target taxonomy; only instances of a class named
          "NcbiTx" are supported, anything else emits a warning.
        * **files** *[list]*: local metadata archives to use instead of downloading.
        * **urls** *[list]*: alternative urls for the metadata archives.

        Returns: dict {GTDB node: set of NCBI nodes}, empty when the target
        taxonomy is not supported.
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "NcbiTx":
            warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                          "," + target_tax.__class__.__name__ + "] not yet implemented.")
            return translated_nodes

        if files:
            fhs = open_files(files)
        else:
            _urls = ["https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz",
                     "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz"]
            fhs = download_files(
                urls=urls if urls else _urls, retry_attempts=3)

        for fh in fhs.values():
            for member in fh.getmembers():
                ext = fh.extractfile(member)
                # extractfile() returns None for members that are neither a
                # regular file nor a link (e.g. directories): skip them
                # instead of failing on the with statement.
                if ext is None:
                    continue
                with ext:
                    for line in ext:
                        if isinstance(line, bytes):
                            line = line.decode()
                        fields = line.rstrip().split('\t')

                        # skip header
                        if fields[0] == "accession":
                            continue

                        # 0 accession
                        # 16 gtdb_taxonomy
                        # 77 ncbi_taxid
                        # 78 ncbi_taxonomy
                        # 79 ncbi_taxonomy_unfiltered

                        # Create lineage from leaf node based on target tax (instead of using field 78)
                        # to accommodate changes in newest versions of the NCBI tax
                        ncbi_leaf_node = target_tax.latest(fields[77])
                        if ncbi_leaf_node == target_tax.undefined_node:
                            continue
                        ncbi_nodes = target_tax.lineage(ncbi_leaf_node, ranks=[
                                                        "superkingdom", "phylum", "class",
                                                        "order", "family", "genus", "species"])

                        # Build GTDB lineage from leaf (species on given lineage)
                        # to accommodate possible changes in the loaded tax
                        gtdb_leaf_node = fields[16].split(";")[-1]
                        if gtdb_leaf_node == self.undefined_node:
                            continue
                        gtdb_nodes = self.lineage(gtdb_leaf_node, ranks=[
                            "domain", "phylum", "class", "order",
                            "family", "genus", "species"])

                        # Match ranks: pair nodes found at the same position of
                        # both lineages, skipping undefined entries
                        for i, gtdb_n in enumerate(gtdb_nodes):
                            if ncbi_nodes[i] != target_tax.undefined_node and gtdb_n != self.undefined_node:
                                translated_nodes.setdefault(
                                    gtdb_n, set()).add(ncbi_nodes[i])

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse GTDB taxonomy lines into node, rank and name dicts.

        Parameters:
        * **fhs**: mapping whose values are iterables of lines (str or bytes),
          each line being "<identifier>\\t<lineage>".

        Returns: tuple (nodes, ranks, names) of dicts keyed by the full lineage
        entry (e.g. "c__Deinococci"), holding its parent, rank and name.

        Raises: ValueError if a line does not have exactly two tab-separated
        fields; AssertionError if a lineage entry has an unexpected rank prefix.
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Binary handles yield bytes: decode once up front instead of
                # relying on a catch-all except, which hid malformed lines
                # behind an unrelated AttributeError.
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split('\t')
                lin = lineage.split(";")
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0]
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    # empty entry "s__"
                    if not name:
                        continue
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i-1]
                    # The first occurrence of a node wins.
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names
#   GtdbTx(**kwargs)
View Source
    def __init__(self, **kwargs):
        # No GTDB-specific state: every option is forwarded unchanged to the
        # parent constructor.
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
View Source
class NcbiTx(MultiTax):
    """
    NCBI taxonomy, parsed either from a single taxdump ".tar.gz" archive or
    from separate nodes / names / merged dump files.
    """

    _default_urls = ["ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"]

    def __init__(self, **kwargs):
        # old taxid -> new taxid, filled from merged.dmp when available
        self._merged = {}
        # non-scientific name -> list of nodes, filled when extended names are parsed
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'NcbiTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Build an NCBI -> GTDB translation from the GTDB metadata archives.

        Parameters:
        * **target_tax**: target taxonomy; only instances of a class named
          "GtdbTx" are supported, anything else emits a warning.
        * **files** *[list]*: local metadata archives to use instead of downloading.
        * **urls** *[list]*: alternative urls for the metadata archives.

        Returns: dict {NCBI node: set of GTDB nodes}, empty when the target
        taxonomy is not supported.
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "GtdbTx":
            warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                          "," + target_tax.__class__.__name__ + "] not yet implemented.")
            return translated_nodes

        if files:
            fhs = open_files(files)
        else:
            _urls = ["https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz",
                     "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz"]
            fhs = download_files(
                urls=urls if urls else _urls, retry_attempts=3)

        for fh in fhs.values():
            for member in fh.getmembers():
                ext = fh.extractfile(member)
                # extractfile() returns None for members that are neither a
                # regular file nor a link (e.g. directories): skip them
                # instead of failing on the with statement.
                if ext is None:
                    continue
                with ext:
                    for line in ext:
                        if isinstance(line, bytes):
                            line = line.decode()
                        fields = line.rstrip().split('\t')

                        # skip header
                        if fields[0] == "accession":
                            continue

                        # 0 accession
                        # 16 gtdb_taxonomy
                        # 77 ncbi_taxid
                        # 78 ncbi_taxonomy
                        # 79 ncbi_taxonomy_unfiltered

                        # Build GTDB lineage from leaf (species on given lineage)
                        # to accommodate possible changes in the loaded tax
                        gtdb_leaf_node = fields[16].split(";")[-1]
                        if gtdb_leaf_node == target_tax.undefined_node:
                            continue
                        gtdb_nodes = target_tax.lineage(gtdb_leaf_node, ranks=[
                                                        "domain", "phylum", "class", "order",
                                                        "family", "genus", "species"])

                        # Build NCBI lineage from leaf
                        ncbi_leaf_node = self.latest(fields[77])
                        if ncbi_leaf_node == self.undefined_node:
                            continue
                        # Additional add connection from leaf to species on GTDB
                        # that could represent strain, etc on NCBI tax
                        translated_nodes.setdefault(
                            ncbi_leaf_node, set()).add(gtdb_leaf_node)
                        ncbi_nodes = self.lineage(ncbi_leaf_node, ranks=[
                                                  "superkingdom", "phylum", "class", "order",
                                                  "family", "genus", "species"])

                        # Match ranks: pair nodes found at the same position of
                        # both lineages, skipping undefined entries
                        for i, ncbi_n in enumerate(ncbi_nodes):
                            if gtdb_nodes[i] != target_tax.undefined_node and ncbi_n != self.undefined_node:
                                translated_nodes.setdefault(
                                    ncbi_n, set()).add(gtdb_nodes[i])

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse the NCBI taxonomy from the given handles.

        Parameters:
        * **fhs**: mapping {source: handle}. A single source ending in ".tar.gz"
          is read as a taxdump archive; otherwise the handles are taken, in
          order, as nodes.dmp, [names.dmp] and [merged.dmp].
        * **kwargs**: must contain "extended_names" whenever names are parsed.

        Returns: tuple (nodes, ranks, names). Merged entries are stored in
        self._merged as a side effect.
        """
        fhs_list = list(fhs.values())
        # One element tar.gz -> taxdump.tar.gz
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
            nodes, ranks, names, self._merged = self._parse_taxdump(
                fhs_list[0], extended_names=kwargs["extended_names"])
        else:
            # nodes.dmp
            nodes, ranks = self._parse_nodes(fhs_list[0])

            # [names.dmp]
            if len(fhs) >= 2:
                names = self._parse_names(
                    fhs_list[1], extended_names=kwargs["extended_names"])
            else:
                names = {}

            # [merged.dmp]
            if len(fhs) == 3:
                self._merged = self._parse_merged(fhs_list[2])
        return nodes, ranks, names

    def _parse_merged(self, fh):
        """
        Parse merged.dmp lines (str or bytes) into a dict {old taxid: new taxid}.
        """
        merged = {}
        for line in fh:
            # Decode bytes explicitly rather than through a catch-all except,
            # so malformed lines raise a meaningful ValueError.
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, _, new_taxid, _ = line.split('\t', 3)
            merged[old_taxid] = new_taxid
        return merged

    def _parse_names(self, fh, extended_names):
        """
        Parse names.dmp lines (str or bytes) into a dict {node: scientific name}.

        When `extended_names` is true, names of every other class are collected
        in self._extended_name_nodes as {name: [nodes]}.
        """
        names = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            node, name, _, name_class = line.split('\t|\t')
            # the last field still carries the line terminator "\t|\n"
            if name_class.replace('\t|\n', '') == "scientific name":
                names[node] = name
            elif extended_names:
                self._extended_name_nodes.setdefault(name, []).append(node)

        return names

    def _parse_nodes(self, fh):
        """
        Parse nodes.dmp lines (str or bytes).

        Returns: tuple (nodes, ranks) with {taxid: parent taxid} and {taxid: rank}.
        """
        nodes = {}
        ranks = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, rank, _ = line.split('\t|\t', 3)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
        return nodes, ranks

    def _parse_taxdump(self, fh_taxdump, extended_names):
        """
        Parse nodes.dmp, names.dmp and merged.dmp out of an opened taxdump archive.

        Returns: tuple (nodes, ranks, names, merged).
        """
        with fh_taxdump.extractfile('nodes.dmp') as fh_nodes:
            nodes, ranks = self._parse_nodes(fh_nodes)
        with fh_taxdump.extractfile('names.dmp') as fh_names:
            names = self._parse_names(fh_names, extended_names=extended_names)
        with fh_taxdump.extractfile('merged.dmp') as fh_merged:
            merged = self._parse_merged(fh_merged)
        return nodes, ranks, names, merged

    def latest(self, node: str):
        # Fall back to the merged.dmp entry when the parent lookup does not
        # know the node. (No docstring on purpose: the parent's one is inherited.)
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.merged(node)
        return n

    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        if node in self._merged:
            return self._merged[node]
        else:
            return self.undefined_node

    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n
        else:
            if exact:
                ret = self._exact_name(text, self._extended_name_nodes)
            else:
                ret = self._partial_name(text, self._extended_name_nodes)

            # Only return nodes of chosen rank
            if rank:
                ret = filter_function(ret, self.rank, rank)

            # Merge both result lists and drop duplicates
            return list(set(n + ret))

    def stats(self):
        # Extend the parent's statistics with NCBI-specific counters, reported
        # only when non-empty. (No docstring on purpose: the parent's one is inherited.)
        s = super().stats()
        if self._merged:
            s["merged"] = len(self._merged)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
#   NcbiTx(**kwargs)
View Source
    def __init__(self, **kwargs):
        # Both containers are created empty before delegating to the parent
        # constructor, which receives every option unchanged.
        # _merged is read by merged() and stats().
        self._merged = {}
        # _extended_name_nodes is read by search_name() and stats().
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
#   def latest(self, node: str):
View Source
    def latest(self, node: str):
        # Ask the parent implementation first; only when it reports the node as
        # undefined is the merged.dmp mapping consulted.
        # (Comments instead of a docstring so the parent's docstring stays inherited.)
        resolved = super().latest(node)
        if resolved != self.undefined_node:
            return resolved
        return self.merged(node)

Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)

#   def merged(self, node: str):
View Source
    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        # Unknown nodes map to the taxonomy's undefined-node marker.
        try:
            return self._merged[node]
        except KeyError:
            return self.undefined_node

Returns relative entry from the merged.dmp file of a given node.

#   def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False ):
View Source
    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        scientific_hits = super().search_name(text, rank=rank, exact=exact)
        # Scientific-name matches are enough unless the extended search is forced
        if scientific_hits and not force_extended:
            return scientific_hits

        lookup = self._exact_name if exact else self._partial_name
        extended_hits = lookup(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            extended_hits = filter_function(extended_hits, self.rank, rank)

        # Merge both result lists and drop duplicates
        return list(set(scientific_hits + extended_hits))

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search names defined as "scientific name" on names.dmp

2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

#   def stats(self):
View Source
    def stats(self):
        # Extend the parent's statistics with NCBI-specific counters; each one
        # is reported only when the matching structure is non-empty.
        # (Comments instead of a docstring so the parent's docstring stays inherited.)
        summary = super().stats()
        extras = (("merged", self._merged),
                  ("extended_names", self._extended_name_nodes))
        for key, entries in extras:
            if entries:
                summary[key] = len(entries)
        return summary

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
View Source
class OttTx(MultiTax):
    # Location of the OTT release archive and the identifier of its root node.
    # NOTE(review): both are presumably consumed by MultiTax as defaults - confirm there.
    _default_urls = ["http://files.opentreeoflife.org/ott/ott3.4/ott3.4.tgz"]
    _default_root_node = "805080"

    # File-name endings used to locate the relevant members inside the archive.
    _taxonomy_suffix = "taxonomy.tsv"
    _forwards_suffix = "forwards.tsv"
    _synonyms_suffix = "synonyms.tsv"

    def __init__(self, **kwargs):
        # OTT-specific lookups start empty; _parse() replaces them when the
        # corresponding files are available.
        self._forwards = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'OttTx({})'.format(', '.join(stats))

    @staticmethod
    def _line_to_str(line):
        """
        Return **line** as text, decoding it first when the handle yielded bytes.

        An explicit type check replaces the former bare `except:` fallback, so
        malformed lines surface as the real unpacking error instead of an
        unrelated failure of `.decode()` on a str.
        """
        return line if isinstance(line, str) else line.decode()

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation from OTT to other taxonomies is not available: warns and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handles into (nodes, ranks, names).

        Parameters:
        * **fhs** *[dict]*: {source: file handle}. Either one ".tgz" archive, or the plain files in the order taxonomy, [forwards], [synonyms].
        * **extended_names** *[bool]*: (keyword) also parse synonyms. Treated as False when not provided.

        Returns: tuple (nodes, ranks, names) of dicts keyed by taxid
        """
        # A missing option used to raise KeyError; treat it as "not requested".
        extended_names = kwargs.get("extended_names", False)
        fhs_list = list(fhs.values())
        if len(fhs_list) == 1 and next(iter(fhs)).endswith(".tgz"):
            nodes, ranks, names = self._parse_ott(
                fhs_list[0], extended_names=extended_names)
        else:
            # taxonomy.tsv
            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
            # [forwards.tsv]
            if len(fhs) >= 2:
                self._forwards = self._parse_forwards(fhs_list[1])
            # [synonyms.tsv]
            if len(fhs) == 3 and extended_names:
                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])

        return nodes, ranks, names

    def _parse_forwards(self, fh):
        """
        Parse forwards.tsv (two tab-separated columns after a header line).

        Returns: dict {old taxid: new taxid}
        """
        forwards = {}
        # Skip the header line; an empty file simply yields an empty dict.
        next(fh, None)
        for line in fh:
            old_taxid, new_taxid = self._line_to_str(line).rstrip().split('\t')
            forwards[old_taxid] = new_taxid
        return forwards

    def _parse_ott(self, fh_taxdump, extended_names):
        """
        Parse taxonomy, forwards and (optionally) synonyms out of an opened OTT archive.

        Parameters:
        * **fh_taxdump**: archive object providing getnames() and extractfile()
        * **extended_names** *[bool]*: also parse synonyms.tsv

        Returns: tuple (nodes, ranks, names)

        Raises: ValueError if a required member is not present in the archive
        """
        # Locate members by the end of their path; the last match wins.
        found = {}
        for member in fh_taxdump.getnames():
            for suffix in (self._taxonomy_suffix, self._forwards_suffix, self._synonyms_suffix):
                if member.endswith(suffix):
                    found[suffix] = member

        required = [self._taxonomy_suffix, self._forwards_suffix]
        if extended_names:
            required.append(self._synonyms_suffix)
        missing = [suffix for suffix in required if suffix not in found]
        if missing:
            # Previously this surfaced as an UnboundLocalError further down.
            raise ValueError(
                "File(s) not found in the OTT archive: {}".format(", ".join(missing)))

        with fh_taxdump.extractfile(found[self._taxonomy_suffix]) as fh_nodes:
            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
        with fh_taxdump.extractfile(found[self._forwards_suffix]) as fh_forwards:
            self._forwards = self._parse_forwards(fh_forwards)
        if extended_names:
            with fh_taxdump.extractfile(found[self._synonyms_suffix]) as fh_synonyms:
                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
        return nodes, ranks, names

    def _parse_synonyms(self, fh):
        """
        Parse synonyms.tsv (fields separated by "\\t|\\t", header on the first line).

        Returns: dict {name: [taxid, ...]}
        """
        synonyms = {}
        # Skip the header line; an empty file simply yields an empty dict.
        next(fh, None)
        for line in fh:
            name, taxid, _ = self._line_to_str(line).split('\t|\t', 2)
            synonyms.setdefault(name, []).append(taxid)

        return synonyms

    def _parse_taxonomy(self, fh):
        """
        Parse taxonomy.tsv (fields separated by "\\t|\\t", header on the first line).

        Returns: tuple (nodes, ranks, names) where nodes maps taxid to parent taxid
        """
        nodes = {}
        ranks = {}
        names = {}
        # Skip the header line; an empty file simply yields empty dicts.
        next(fh, None)
        for line in fh:
            taxid, parent_taxid, name, rank, _ = self._line_to_str(line).split('\t|\t', 4)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
            names[taxid] = name
        return nodes, ranks, names

    def forwards(self, node: str):
        """
        Returns relative entry from the forwards.tsv file of a given node.
        """
        if node in self._forwards:
            return self._forwards[node]
        return self.undefined_node

    def latest(self, node: str):
        # Fall back to the forwards.tsv mapping only when the parent class
        # reports the node as undefined.
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.forwards(node)
        return n

    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search default names defined on "taxonomy.tsv"

        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # Merge both result sets and drop duplicates (order is not preserved).
        return list(set(n + ret))

    def stats(self):
        # Extend the parent's numbers with the OTT-specific counters, but only
        # when the corresponding data was loaded.
        s = super().stats()
        if self._forwards:
            s["forwards"] = len(self._forwards)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
#   OttTx(**kwargs)
View Source
    def __init__(self, **kwargs):
        # Both OTT-specific lookups start out empty and are set before the
        # parent constructor is invoked.
        self._forwards, self._extended_name_nodes = {}, {}
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
#   def forwards(self, node: str):
View Source
    def forwards(self, node: str):
        """
        Look up a node in the entries loaded from the forwards.tsv file.

        Returns the entry stored for **node**, or the undefined node when there is none.
        """
        try:
            return self._forwards[node]
        except KeyError:
            return self.undefined_node

Returns relative entry from the forwards.tsv file of a given node.

#   def latest(self, node: str):
View Source
    def latest(self, node: str):
        # Ask the parent class first; only when it answers with the undefined
        # node is the forwards.tsv mapping consulted.
        current = super().latest(node)
        return self.forwards(node) if current == self.undefined_node else current

Returns latest/updated version of a given node. If node is already the latest, returns itself. Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv).

#   def search_name( self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False ):
View Source
    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Find nodes by exact or partial name.

        Search order (use **force_extended=True** to search everything at once):

        1) Default names defined on "taxonomy.tsv"

        2) Only if step 1 found nothing: all other names defined on "synonyms.tsv" (requires OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        primary = super().search_name(text, rank=rank, exact=exact)
        if primary and not force_extended:
            return primary

        # Pick the matcher once, then run it against the extended names.
        matcher = self._exact_name if exact else self._partial_name
        extended = matcher(text, self._extended_name_nodes)

        # Restrict the extended hits to the requested rank, if any.
        if rank:
            extended = filter_function(extended, self.rank, rank)

        combined = set(primary + extended)
        return list(combined)

Search node by exact or partial name.

Default order (can be skipped with force_extended=True):

1) Search default names defined on "taxonomy.tsv"

2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(extended_names=True))

Parameters:

  • text [str]: Text to search.
  • rank [str]: Filter results by rank.
  • exact [bool]: Exact or partial name search (both case sensitive).
  • force_extended [bool]: Search for text in all categories at once.

Returns: list of matching nodes

#   def stats(self):
View Source
    def stats(self):
        # Begin with the numbers reported by the parent class, then append the
        # OTT-specific counters below.
        summary = super().stats()
        optional_counts = (
            ("forwards", self._forwards),
            ("extended_names", self._extended_name_nodes),
        )
        for label, entries in optional_counts:
            # Empty collections are left out of the report entirely.
            if entries:
                summary[label] = len(entries)
        return summary

Returns a dict with general numbers of the taxonomic tree

Example:

from pprint import pprint
from multitax import GtdbTx
tax = GtdbTx()

pprint(tax.stats())
{'leaves': 30238,
 'names': 42739,
 'nodes': 42739,
 'ranked_leaves': Counter({'species': 30238}),
 'ranked_nodes': Counter({'species': 30238,
                          'genus': 8778,
                          'family': 2323,
                          'order': 930,
                          'class': 337,
                          'phylum': 131,
                          'domain': 1,
                          'root': 1}),
 'ranks': 42739}
View Source
class SilvaTx(MultiTax):
    # Location of the SILVA SSU taxonomy export.
    # NOTE(review): presumably consumed by MultiTax as the default download - confirm there.
    _default_urls = [
        "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"]

    def __init__(self, **kwargs):
        # No SILVA-specific state: all arguments go to the parent constructor.
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'SilvaTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation from SILVA to other taxonomies is not available: warns and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse SILVA taxonomy files into (nodes, ranks, names).

        Each line holds tab-separated fields, of which the first three are used:
        the name lineage (names joined and terminated by ";"), the taxid and the rank.

        Parameters:
        * **fhs** *[dict]*: {source: file handle}, yielding str or bytes lines.

        Returns: tuple (nodes, ranks, names) of dicts keyed by taxid

        Raises: KeyError if the parent lineage of an entry is not present in the files
        """
        nodes = {}
        ranks = {}
        names = {}

        # lineage (without trailing ";") -> taxid, used to resolve parents by name
        lin = {}
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes; decode explicitly instead of relying
                # on a bare `except:` that also hid malformed-line errors.
                if not isinstance(line, str):
                    line = line.decode()
                name_lineage, taxid, rank, _ = line.split('\t', 3)
                # Remove last char ";"
                lineage = name_lineage[:-1]
                name = lineage.split(";")[-1]
                # Save lineage to build tree
                lin[lineage] = taxid
                names[taxid] = name
                ranks[taxid] = rank

        # Build parent node connection
        for lineage, taxid in lin.items():
            t = taxid
            ancestors = lineage.split(";")[:-1]
            # Walk up the lineage, linking each level to the one above it.
            # Links that already exist are kept (first assignment wins).
            while ancestors:
                parent_taxid = lin[";".join(ancestors)]
                if t not in nodes:
                    nodes[t] = parent_taxid
                t = parent_taxid
                del ancestors[-1]  # remove last element
            # Connect last node to root
            # NOTE(review): _default_root_node is not defined in this class;
            # presumably inherited from MultiTax - confirm.
            if t not in nodes:
                nodes[t] = self._default_root_node

        return nodes, ranks, names
#   SilvaTx(**kwargs)
View Source
    def __init__(self, **kwargs):
        # Nothing is initialised here: every keyword argument is handed to the
        # parent constructor unchanged.
        super().__init__(**kwargs)

Main constructor of MultiTax and sub-classes

Parameters:

  • files [str, list]: One or more local files to parse.
  • urls [str, list]: One or more urls to download and parse.
  • output_prefix [str]: Directory to write downloaded files.
  • root_node [str]: Define an alternative root node.
  • root_parent [str]: Define the root parent node identifier.
  • root_name [str]: Define an alternative root name. Set to None to use original name.
  • root_rank [str]: Define an alternative root rank. Set to None to use original rank.
  • undefined_node [str]: Define a default return value for undefined nodes.
  • undefined_name [str]: Define a default return value for undefined names.
  • undefined_rank [str]: Define a default return value for undefined ranks.
  • build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
  • build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
  • build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
  • extended_names [bool]: Parse extended names if available.

Example:

tax_ncbi = NcbiTx()
tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
tax_ott = OttTx(root_node="844192")
tax_gg = GreengenesTx(output_prefix="save/to/prefix_")