Source code for pds4_tools.reader.read_label

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError

from ..utils import compat
from ..utils.constants import PDS4_NAMESPACES
from ..utils.logging import logger_init

from ..extern import six

# Initialize the module-level logger using the package's ``logger_init`` helper
logger = logger_init()


def read_label(filename, strip_extra_whitespace=True, enforce_default_prefixes=False,
               include_namespace_map=False, decode_py2=False):
    """ Reads a PDS4 XML Label into an ``ElementTree`` Element object.

    Parameters
    ----------
    filename : str or unicode
        The filename, including the path, of the XML label.
    strip_extra_whitespace : bool, optional
        If True, then for element text and attribute values, it collapses
        contiguous whitespaces (including space, tab and newlines) into a
        single space, and removes leading and trailing whitespace altogether.
        However, this is only done if the value has a single line with
        non-whitespace characters. Element text is only adjusted for elements
        that have no children. Defaults to True.
    enforce_default_prefixes : bool, optional
        If True, strips the default namespace, and ensures that default
        PDS4 prefixes are used for known namespaces (PDS4_NAMESPACES).
        Defaults to False.
    include_namespace_map : bool, optional
        If True, changes method return to a tuple, where the first value
        is the label ElementTree object as usual and the second is a
        ``dict`` with keys being the namespace URIs and values being the
        namespace prefixes in this label. Defaults to False.
    decode_py2 : bool, optional
        If True, decodes UTF-8 byte strings (``str``) into ``unicode``
        strings in Python 2. Option is ignored in Python 3. Defaults to False.

    Returns
    -------
    ``ElementTree`` Element or tuple
        Root element for the read-in PDS4 label. When *include_namespace_map*
        is True, a ``(root, namespace_map)`` tuple is returned instead, where
        ``namespace_map`` is a ``dict`` of the form ``{URI: prefix}``.

    Raises
    ------
    IOError
        If the label file could not be located or read.
    ExpatError
        If the label file could not be parsed as XML.
    """

    # Read-in XML tree
    try:
        xml_tree = ET.iterparse(filename, events=('start-ns', 'end'))
    except IOError:
        raise IOError('Unable to locate or read label file: ' + filename)

    # Adjust XML tree
    try:
        namespace_map = {}

        for event, elem in xml_tree:

            # Add namespace to the namespace map. For 'start-ns' events the payload
            # is a (prefix, URI) tuple rather than an Element.
            if event == 'start-ns':
                prefix, uri = elem

                if enforce_default_prefixes:

                    # Ensure the PDS4 namespace is the default prefix
                    if uri == PDS4_NAMESPACES['pds']:
                        prefix = ''

                    # Ensure that dictionaries which are referred to in code
                    # by prefix (such as disp and sp) have the expected prefix.
                    # No early exit: if several known prefixes share this URI,
                    # the last one encountered is used.
                    else:
                        for known_prefix, known_uri in six.iteritems(PDS4_NAMESPACES):
                            if known_uri == uri:
                                prefix = known_prefix

                # Add namespace to map (different prefixes for an existing namespace URI are skipped)
                # Technical note: this map is stored dict[URI] = prefix for two reasons:
                #   (1) ElementTree itself internally stores the namespace map like this, despite taking
                #       it in opposite relation from user
                #   (2) It seems ever slightly more legitimate to remember a single prefix to referring to
                #       multiple URI (e.g. via local prefixes) than to remember two prefixes referring to
                #       the same URI.
                # These are not necessarily good reasons.
                if uri not in namespace_map:
                    namespace_map[uri] = prefix

                continue

            # Strip PDS4 namespace tag (a continuation of ensuring default prefix is PDS4 namespace)
            if enforce_default_prefixes and (PDS4_NAMESPACES['pds'] in elem.tag):
                elem.tag = elem.tag.split('{', 1)[0] + elem.tag.split('}', 1)[1]

            # Strip whitespace in elements and attributes if requested
            if strip_extra_whitespace:

                # Strip whitespaces at beginning and end of value in elements that do not have
                # children. The tree iterator is only built for such elements, since it is not
                # needed otherwise.
                if len(elem) == 0:
                    for elem_content in compat.ET_Tree_iter(ET.ElementTree(elem)):
                        if (elem_content.text) and (_non_blank_line_count(elem_content.text) == 1):
                            elem_content.text = _normalize(elem_content.text)

                # Strip whitespaces at beginning and end of attribute values. Only values of
                # existing keys are replaced, so the dict does not change size while iterating.
                for name, value in six.iteritems(elem.attrib):
                    if _non_blank_line_count(value) == 1:
                        elem.attrib[name] = _normalize(value)

        label_xml_root = xml_tree.root

        # For Python 2, we can decode all ``str`` to ``unicode``, such that all meta data strings
        # are consistently unicode.
        if six.PY2 and decode_py2:
            label_xml_root = _decode_tree(label_xml_root)

    # Raise exception if XML cannot be parsed. In Python 3 we raise from None to avoid confusing re-raise
    except (ExpatError, compat.ET_ParseError):
        six.raise_from(
            ExpatError('The requested PDS4 label file does not appear contain valid XML: ' + filename), None)

    if include_namespace_map:
        return label_xml_root, namespace_map

    return label_xml_root


def _decode_tree(xml_tree): """ Decode an XML tree from UTF-8 encoded ``str`` to ``unicode``. Decodes all element tags and text, as well as attribute names and values. Do not call gratuitously due to efficiency concerns. Notes ----- This function is intended to be used solely in Python 2. Python 3 has no ``unicode`` data type, all ``str`` are essentially ``unicode`` by default. Parameters ---------- xml_tree : ``ElementTree`` Element The XML tree to decoded. Returns ------- ``ElementTree`` Element The decoded XML tree, with all strings converted to ``unicode`` from UTF-8 ``str``. """ # This function is designed to work solely in Python 2; otherwise we return the tree unchanged. if not six.PY2: return xml_tree # Function that decodes all passed in text to unicode, assuming it's encoded as UTF-8 def decode(text): if text is None: return None if isinstance(text, str): return text.decode('utf-8') return text # Loop over all elements in the tree for elem in compat.ET_Element_iter(xml_tree): # Decode elements elem.tag = decode(elem.tag) elem.text = decode(elem.text) elem.tail = decode(elem.tail) # Decode attributes for name, value in elem.attrib.items(): del elem.attrib[name] name = decode(name) value = decode(value) elem.attrib[name] = value return xml_tree def _non_blank_line_count(string): """ Parameters ---------- string : str or unicode String (potentially multi-line) to search in. Returns ------- int Number of non-blank lines in string. """ non_blank_counter = 0 for line in string.splitlines(): if line.strip(): non_blank_counter += 1 return non_blank_counter def _normalize(string): """ Normalize whitepace in a string according to PDS4 Standards. Notes ----- There are a number of ways to implement this method. The employed implementation is generally either the fastest or close to the fastest between the various platforms. Parameters ---------- string : str or unicode String to normalize. Returns ------- str or unicode Whitespace-collapsed string. 
Collapses contiguous spaces (including line feeds, carriage returns, tabs) into a single space character, and removes leading and trailing spaces entirely. white space collapsed """ return ' '.join(string.split())