# Source code for pds4_tools.reader.read_label
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError
from ..utils import compat
from ..utils.constants import PDS4_NAMESPACES
from ..utils.logging import logger_init
from ..extern import six
# Initialize the module-level logger using the ``logger_init`` helper imported above
logger = logger_init()
#################################
def read_label(filename, strip_extra_whitespace=True, enforce_default_prefixes=False,
               include_namespace_map=False, decode_py2=False):
    """ Reads a PDS4 XML Label into an ``ElementTree`` Element object.

    Parameters
    ----------
    filename : str or unicode
        The filename, including the path, of the XML label.
    strip_extra_whitespace : bool, optional
        If True, then for attribute values and for the text of elements that
        have no children, it collapses contiguous whitespaces (including space,
        tab and newlines) into a single space, and removes leading and trailing
        whitespace altogether. However, this is only done if the value has a
        single line with non-whitespace characters. Defaults to True.
    enforce_default_prefixes : bool, optional
        If True, strips the default namespace, and ensures that
        default PDS4 prefixes are used for known namespaces (PDS4_NAMESPACES).
        Defaults to False.
    include_namespace_map : bool, optional
        If True, changes method return to a tuple, where the first
        value is the label ElementTree object as usual and the second
        is a ``dict`` with keys being the namespace URIs and values being
        the namespace prefixes in this label. Defaults to False.
    decode_py2 : bool, optional
        If True, decodes UTF-8 byte strings (``str``) into ``unicode``
        strings in Python 2. Option is ignored in Python 3. Defaults to False.

    Returns
    -------
    ``ElementTree`` Element, or tuple
        Root element for the read-in PDS4 label. When *include_namespace_map*
        is True, a ``(root element, namespace map)`` tuple is returned instead.

    Raises
    ------
    IOError
        If opening the label file for parsing raises an ``IOError``.
    ExpatError
        If the label file cannot be parsed as XML.
    """

    # Read-in XML tree
    try:
        xml_tree = ET.iterparse(filename, events=('start-ns', 'end'))
    except IOError:
        raise IOError('Unable to locate or read label file: ' + filename)

    # ElementTree stores namespaced tags as '{URI}local-name'; build the exact prefix for the PDS4
    # namespace once. (Only looked up when needed, as in the start-ns handling below.)
    pds_uri = None
    pds_tag_prefix = None

    if enforce_default_prefixes:
        pds_uri = PDS4_NAMESPACES['pds']
        pds_tag_prefix = '{' + pds_uri + '}'

    # Adjust XML tree
    try:
        namespace_map = {}

        for event, elem in xml_tree:

            # Add namespace to the namespace map. For 'start-ns' events, *elem* is a
            # (prefix, URI) tuple rather than an Element.
            if event == 'start-ns':

                prefix, uri = elem

                if enforce_default_prefixes:

                    # Ensure the PDS4 namespace is the default prefix
                    if uri == pds_uri:
                        prefix = ''

                    # Ensure that dictionaries which are referred to in code
                    # by prefix (such as disp and sp) have the expected prefix
                    else:
                        for known_prefix, known_uri in six.iteritems(PDS4_NAMESPACES):
                            if known_uri == uri:
                                prefix = known_prefix

                # Add namespace to map (different prefixes for an existing namespace URI are skipped)
                # Technical note: this map is stored dict[URI] = prefix for two reasons:
                #   (1) ElementTree itself internally stores the namespace map like this, despite taking
                #       it in opposite relation from user
                #   (2) It seems ever slightly more legitimate to remember a single prefix referring to
                #       multiple URI (e.g. via local prefixes) than to remember two prefixes referring to
                #       the same URI.
                # These are not necessarily good reasons.
                if uri not in namespace_map:
                    namespace_map[uri] = prefix

                continue

            # Strip PDS4 namespace tag (a continuation of ensuring default prefix is PDS4 namespace).
            # The namespace is matched exactly, consistent with the equality test used for 'start-ns'
            # events, so tags in a namespace whose URI merely contains the PDS4 URI are left alone.
            if enforce_default_prefixes and elem.tag.startswith(pds_tag_prefix):
                elem.tag = elem.tag[len(pds_tag_prefix):]

            # Strip whitespace in elements and attributes if requested
            if strip_extra_whitespace:

                # Strip whitespaces at beginning and end of value in elements that do not have children
                if len(elem) == 0:

                    for elem_content in compat.ET_Tree_iter(ET.ElementTree(elem)):
                        text = elem_content.text

                        if text and (_non_blank_line_count(text) == 1):
                            elem_content.text = _normalize(text)

                # Strip whitespaces at beginning and end of attribute values. Iterate over a snapshot,
                # since the attribute dict is assigned to inside the loop.
                for name, value in list(elem.attrib.items()):

                    if _non_blank_line_count(value) == 1:
                        elem.attrib[name] = _normalize(value)

        # The root only becomes available once the iterator has been fully consumed
        label_xml_root = xml_tree.root

        # For Python 2, we can decode all ``str`` to ``unicode``, such that all meta data strings
        # are consistently unicode.
        if six.PY2 and decode_py2:
            label_xml_root = _decode_tree(label_xml_root)

    # Raise exception if XML cannot be parsed. In Python 3 we raise from None to avoid confusing re-raise
    except (ExpatError, compat.ET_ParseError):
        six.raise_from(
            ExpatError('The requested PDS4 label file does not appear contain valid XML: ' + filename), None)

    if include_namespace_map:
        return label_xml_root, namespace_map

    return label_xml_root
def _decode_tree(xml_tree):
    """ Convert the strings of an XML tree from UTF-8 encoded ``str`` to ``unicode``.

    Element tags, text and tails are decoded, as are attribute names and values. The tree is
    modified in place. Do not call gratuitously due to efficiency concerns.

    Notes
    -----
    This function is intended to be used solely in Python 2. When not running under Python 2
    the tree is handed back untouched.

    Parameters
    ----------
    xml_tree : ``ElementTree`` Element
        The XML tree to be decoded.

    Returns
    -------
    ``ElementTree`` Element
        The same tree that was passed in, with all UTF-8 ``str`` values converted to ``unicode``.
    """

    # Nothing to do unless we are running under Python 2
    if not six.PY2:
        return xml_tree

    def to_unicode(value):
        # Only ``str`` values are decoded; None and anything else pass straight through
        return value.decode('utf-8') if isinstance(value, str) else value

    for node in compat.ET_Element_iter(xml_tree):

        # Tag, text and tail of the element itself
        node.tag = to_unicode(node.tag)
        node.text = to_unicode(node.text)
        node.tail = to_unicode(node.tail)

        # Re-insert every attribute under its decoded name; the keys are snapshotted first
        # because entries are removed and added while walking them
        for raw_name in list(node.attrib.keys()):
            raw_value = node.attrib.pop(raw_name)
            node.attrib[to_unicode(raw_name)] = to_unicode(raw_value)

    return xml_tree
def _non_blank_line_count(string):
"""
Parameters
----------
string : str or unicode
String (potentially multi-line) to search in.
Returns
-------
int
Number of non-blank lines in string.
"""
non_blank_counter = 0
for line in string.splitlines():
if line.strip():
non_blank_counter += 1
return non_blank_counter
def _normalize(string):
""" Normalize whitepace in a string according to PDS4 Standards.
Notes
-----
There are a number of ways to implement this method. The employed implementation is generally
either the fastest or close to the fastest between the various platforms.
Parameters
----------
string : str or unicode
String to normalize.
Returns
-------
str or unicode
Whitespace-collapsed string. Collapses contiguous spaces (including line feeds, carriage returns,
tabs) into a single space character, and removes leading and trailing spaces entirely.
white space collapsed
"""
return ' '.join(string.split())