Source code for treemaker.treemaker

#!/usr/bin/env python
#coding=utf-8
"""TreeMaker"""
__author__ = 'Simon J. Greenhill <simon@simon.net.nz>'
__copyright__ = 'Copyright (c) 2018 Simon J. Greenhill'
__license__ = 'New-style BSD'

import os
import re
import sys
import codecs
import argparse
from functools import total_ordering

# Version string of this module.
VERSION = "1.3"

# %-format template for NEXUS output. `TreeMaker.write` fills in the
# `label` and `tree` keys.
NEXUS_TEMPLATE = """#NEXUS

begin trees;
   tree %(label)s = %(tree)s
end;
"""

# Characters that `Tree._sanitise` rejects in node labels.
BADCHARS = "();"

# Matches a run of one or more whitespace characters. `TreeMaker.read` uses
# it to split each input line into a taxon and its classification.
IS_WHITESPACE = re.compile(r"""\s+""")


@total_ordering
class Tree(object):
    """
    Tree object to represent the classification taxonomy.

    Trees are compared (and therefore sorted) by their node label only.

    Args:
        node (str): Label for this node.
        children (list): Optional list of children nodes
        show_nodelabels (boolean): A flag to show nodelabels or not
            (default=False)

    Raises:
        ValueError: if the label contains one of the characters in
            `BADCHARS`.
    """
    def __init__(self, node=None, children=None, show_nodelabels=False):
        self.children = []
        self.show_nodelabels = show_nodelabels
        if node is None:
            self.node = ''
        else:
            self.node = self._sanitise(str(node))

        if children is not None:
            for child in children:
                self.add(child)

    def __lt__(self, other):
        # Let Python try the reflected operation / raise TypeError rather
        # than failing with AttributeError on unrelated types.
        if not isinstance(other, Tree):
            return NotImplemented
        return self.node < other.node

    def __eq__(self, other):
        if not isinstance(other, Tree):
            return NotImplemented
        return self.node == other.node

    @property
    def is_tip(self):
        """Returns True if node is a tip (i.e. has no children)"""
        return not self.children

    @property
    def is_node(self):
        """Returns True if node is a node (i.e. has children)"""
        return bool(self.children)

    def add(self, node, children=None):
        """
        Adds a node (with optional children) to the tree.

        Args:
            node (str or treemaker.Tree): Label for the new node, or an
                existing `Tree` to attach as-is.
            children (list): Optional list of children nodes. Only used when
                `node` is not already a `Tree`.

        Returns:
            treemaker.Tree: the node that was added.

        Raises:
            ValueError: if the node label contains a forbidden character.
        """
        if not isinstance(node, Tree):
            node = Tree(node, children, show_nodelabels=self.show_nodelabels)
        # Validate again: the label of an existing Tree can be reassigned
        # after construction.
        self._sanitise(node.node)
        self.children.append(node)
        return node

    def get_or_create(self, label):
        """
        Helper function to get or create a node.

        Args:
            label (str): Label for this node.

        Returns:
            treemaker.Tree: the found or created node matching `label`.
        """
        found = self.get(label)
        if found is not None:
            return found
        return self.add(label)

    def get(self, label, node=None):
        """
        Searches the descendants of a node (depth first) for `label`.

        The starting node itself is never returned, only nodes below it.

        Args:
            label (str): Label to search for.
            node (treemaker.Tree): (optional) parent node. Default is
                current node.

        Returns:
            treemaker.Tree: the first node found matching `label`, or None
                if there is no such node.
        """
        node = self if node is None else node
        for child in node.children:
            if child.node == label:
                return child
            found = self.get(label, child)
            if found is not None:
                return found
        return None

    def tips(self, node=None):
        """
        Yields the tips in the tree.

        Args:
            node (treemaker.Tree): (optional) parent node. Default is
                current node.

        Yields:
            treemaker.Tree: each tip node below the given node.
        """
        node = self if node is None else node
        for child in node.children:
            if child.is_node:
                # remove yield from so we can support py2.7
                for tip in self.tips(child):
                    yield tip
            else:
                yield child

    def _sanitise(self, node):
        """Returns `node` unchanged, or raises ValueError if it contains
        any of the characters in `BADCHARS`."""
        for char in BADCHARS:
            if char in node:
                raise ValueError(
                    "Forbidden character '%s' node: %s" % (char, node)
                )
        return node

    def __repr__(self):
        return "<Tree: %s>" % self.node

    def __str__(self):
        """
        Formats the tree as a newick-style string (without the final ";").

        Children are written in sorted order. A node with exactly one child
        is collapsed to that child's output, and node labels are only
        appended when `show_nodelabels` is set.
        """
        if not self.children:
            return self.node

        out = ",".join([str(c) for c in sorted(self.children)])
        if len(self.children) == 1:
            return out
        elif self.show_nodelabels:
            return "(%s)%s" % (out, self.node)
        else:
            return "(%s)" % out
class TreeMaker(object):
    """
    Builds a `Tree` from taxa and their classification strings.

    Args:
        label (str): Label for the root node (default "root").
        nodelabels (bool): show node labels in the output or not
            (default False).
    """
    def __init__(self, label="root", nodelabels=False):
        self.tree = Tree(label, show_nodelabels=nodelabels)
        # (leaf, classification) pairs seen so far, to reject duplicates.
        self._added = set()

    def _check_taxon(self, taxon):
        """Raises ValueError if `taxon` contains a parenthesis."""
        for char in "()":
            if char in taxon:
                raise ValueError(
                    "Error: %s is not allowed in taxon names" % char
                )

    def add(self, leaf, classification):
        """
        Adds `leaf` to the tree in the location specified by `classification`

        Args:
            leaf (str): Leaf label
            classification (str): A classification string of a format
                handled by `parse_classification`.

        Returns:
            treemaker.Tree: the tree with the new node added.

        Raises:
            ValueError: If `leaf` contains a parenthesis, or if the same
                leaf and classification pair has already been added.
        """
        self._check_taxon(leaf)

        if (leaf, classification) in self._added:
            raise ValueError("Duplicate Taxon/Classification")

        # Walk down the classification, creating missing nodes on the way.
        parent = self.tree
        for node in self.parse_classification(classification):
            parent = parent.get_or_create(node)
        parent.add(leaf)
        self._added.add((leaf, classification))
        return self.tree

    def add_from(self, iterable):
        """
        Adds all entries from an `iterable`.

        `iterable` should be a list of lists or a list of tuples (etc) with
        2 values - the first one the taxon name, the second the
        classification string, e.g.

        >>> iterable = [
        ...     ['taxon1', 'a, a'],
        ...     ['taxon2', 'a, b'],
        ... ]
        >>> tree = TreeMaker().add_from(iterable)

        Args:
            iterable (iter): an iterable (e.g. a list).

        Returns:
            treemaker.Tree: the tree with the new nodes added.

        Raises:
            ValueError: If each member of the iterable does not contain
                two entries (leaf name, and classification).
        """
        for i, row in enumerate(iterable, 1):
            if len(row) != 2:
                raise ValueError("entry %d is not a tuple or list" % i)
            self.add(row[0], row[1])
        return self.tree

    def parse_classification(self, classification):
        """
        Parses a classification string into nodes.

        >>> TreeMaker().parse_classification("Indo-European, Germanic, English")
        ['Indo-European', 'Germanic', 'English']

        Args:
            classification (str): a classification string
                e.g. "clade 1, clade 2, clade 3"

        Returns:
            List: a list of the classification nodes.
        """
        # simple for now, but easily subclassed for more complicated schema
        return [node.strip() for node in classification.strip().split(",")]

    def read(self, filename):
        """
        Reads data from `filename` and constructs a tree.

        `filename` is read as utf8 and should be formatted as follows::

            Taxon1       FamilyA, GroupA, SubgroupA
            Taxon2       FamilyA, GroupA, SubgroupB
            Taxon3       FamilyA, GroupB
            ... etc

        Empty lines are skipped.

        Args:
            filename (str): a filename containing the classification.

        Returns:
            treemaker.Tree: a `Tree` with the specified classification.

        Raises:
            ValueError: if a line in the file is not able to be parsed.
        """
        with codecs.open(filename, 'r', encoding="utf8") as handle:
            for i, line in enumerate(handle, 1):
                line = line.strip()
                if not line:
                    continue  # skip empty lines
                if not IS_WHITESPACE.search(line):
                    raise ValueError(
                        "Malformed line %d -- I need one space: %s" % (
                            i, line
                        )
                    )
                # The first run of whitespace separates taxon from
                # classification; later whitespace belongs to the latter.
                taxon, classification = IS_WHITESPACE.split(line, 1)
                self.add(taxon.strip(), classification.strip())
        return self.tree

    def write(self, mode="newick"):
        """
        Writes the output form of the tree.

        Args:
            mode (str): An output mode. One of:
                * "nexus" = a nexus file is generated
                * "newick" = a newick file (bare tree) is generated

        Returns:
            str: a string containing the formatted content.

        Raises:
            ValueError: if mode is not "nexus" or "newick".
        """
        if mode == 'newick':
            return "%s;" % str(self.tree)
        elif mode == 'nexus':
            return NEXUS_TEMPLATE % {
                'label': self.tree.node or 'tree',
                # A NEXUS tree statement must end with a semicolon, which
                # neither the template nor str(tree) provides.
                'tree': "%s;" % str(self.tree),
            }
        else:
            raise ValueError(
                "Unknown output mode. Please use 'nexus' or 'newick'"
            )

    def write_to_file(self, filename, mode="nexus"):
        """
        Writes the tree to `filename`, encoded as utf8.

        Args:
            filename (str): the file to create.
            mode (str): An output mode. One of:
                * "nexus" = a nexus file is generated
                * "newick" = a newick file (bare tree) is generated

        Returns:
            None

        Raises:
            IOError: if `filename` already exists.
            ValueError: if mode is not "nexus" or "newick".
        """
        if os.path.isfile(filename):
            raise IOError("File %s already exists" % filename)

        # `write` validates `mode`, so an invalid mode raises ValueError
        # before the output file is created.
        content = self.write(mode=mode)

        # Use the same encoding as `read` so non-ASCII labels round-trip.
        with codecs.open(filename, 'w', encoding="utf8") as handle:
            handle.write(content)
def parse_args(args):
    """
    Parses command line arguments.

    Args:
        args (list): the command line argument strings, without the
            program name.

    Returns:
        tuple: (input filename, output mode, output filename or None,
            show node labels flag)

    Raises:
        IOError: if the input file does not exist.
    """
    cli = argparse.ArgumentParser(
        description='Constructs a tree from a classification table'
    )
    cli.add_argument("input", help="inputfile")
    cli.add_argument(
        '-o', "--output",
        action='store',
        dest='output',
        default=None,
        help="output file",
    )
    cli.add_argument(
        '-m', "--mode",
        action='store',
        dest='mode',
        choices=['nexus', 'newick'],
        default="newick",
        help="output mode: nexus or newick",
    )
    cli.add_argument(
        '-l', "--labels",
        action='store_true',
        dest='nodelabels',
        default=False,
        help="show node labels",
    )

    options = cli.parse_args(args)
    infile = options.input
    if os.path.isfile(infile):
        return (infile, options.mode, options.output, options.nodelabels)
    raise IOError("File %s does not exist" % infile)
def main(args=None):  # pragma: no cover
    """
    Command line entry point.

    Reads the classification file named in the arguments, builds the tree,
    and either prints it or writes it to the requested output file.

    Args:
        args (list): command line argument strings. Defaults to
            `sys.argv[1:]` when None.
    """
    argv = sys.argv[1:] if args is None else args
    infile, mode, outfile, nodelabels = parse_args(argv)

    maker = TreeMaker(nodelabels=nodelabels)
    maker.read(infile)

    if outfile is not None:
        maker.write_to_file(outfile, mode=mode)
    else:
        print(maker.write(mode=mode))