# Copyright 2010, 2011, 2012, 2013 (C) Adam Greig, Daniel Richman
#
# This file is part of habitat.
#
# habitat is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# habitat is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with habitat.  If not, see <http://www.gnu.org/licenses/>.

"""
Interpret incoming telemetry strings into useful telemetry data.
"""

import base64
import logging
import hashlib
import M2Crypto
import os
import couchdbkit
import copy
import re
import json
import statsd
import time
import strict_rfc3339

from . import loadable_manager
from .utils import dynamicloader, quick_traceback

logger = logging.getLogger("habitat.parser")
statsd.init_statsd({'STATSD_BUCKET_PREFIX': 'habitat'})

__all__ = ['Parser', 'ParserModule']


class Parser(object):
    """
    habitat's parser

    :class:`Parser` takes arbitrary unparsed payload telemetry and
    attempts to use each loaded :class:`ParserModule` to turn this
    telemetry into useful data.
    """

    ascii_exp = re.compile("^[\\x20-\\x7E]+$")

    def __init__(self, config):
        """
        On construction, the parser will:

        * Use ``config["parser"]`` as its configuration.
        * Load and sanity-check the modules listed in
          ``config["parser"]["modules"]``.
        * Connect to CouchDB using ``config["couch_uri"]`` and
          ``config["couch_db"]``.
        """
        config = copy.deepcopy(config)
        parser_config = config["parser"]

        # The loadable_manager is used by both ParserFiltering and the
        # ParserModules.
        self.loadable_manager = loadable_manager.LoadableManager(config)
        self.filtering = ParserFiltering(config, self.loadable_manager)

        self.modules = []
        for module in parser_config["modules"]:
            m = dynamicloader.load(module["class"])
            dynamicloader.expecthasmethod(m, "pre_parse")
            dynamicloader.expecthasmethod(m, "parse")
            dynamicloader.expecthasnumargs(m.pre_parse, 1)
            dynamicloader.expecthasnumargs(m.parse, 2)
            module["module"] = m(self)
            self.modules.append(module)

        self.couch_server = couchdbkit.Server(config["couch_uri"])
        self.db = self.couch_server[config["couch_db"]]
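
    # An illustrative sketch (not part of habitat itself) of the ``config``
    # dict that __init__ above expects. All values are placeholders; the
    # module class path shown is the UKHAS protocol module's usual
    # location, but treat it as an assumption.
    #
    #   config = {
    #       "couch_uri": "http://localhost:5984",
    #       "couch_db": "habitat",
    #       "parser": {
    #           "certs_dir": "/path/to/certs",
    #           "modules": [
    #               {"name": "UKHAS",
    #                "class":
    #                    "habitat.parser_modules.ukhas_parser.UKHASParser"},
    #           ],
    #       },
    #   }
    #   parser = Parser(config)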

    @statsd.StatsdTimer.wrap('parser.time')
    def parse(self, doc, initial_config=None):
        """
        Attempts to parse telemetry information out of a new telemetry
        document *doc*.

        This function attempts to determine which of the loaded parser
        modules should be used to parse the message, and which
        payload_configuration document it should be given to do so (if
        *initial_config* is specified, no attempt will be made to find any
        other configuration document).

        The resulting parsed document is returned, or None is returned if
        no data could be parsed.

        Some field names in ``doc["data"]`` are reserved, as indicated by
        a leading underscore. These fields may include:

        * ``_protocol``: the name of the parser module that was used to
          decode this message

        From the UKHAS parser module in particular:

        * ``_sentence``: the ASCII sentence from the UKHAS parser

        Parser modules should be wary when outputting field names with
        leading underscores.
        """
        data = None
        raw_data = base64.b64decode(doc['data']['_raw'])
        debug_type, debug_data = self._get_debug(raw_data)
        receiver_callsign = doc['receivers'].keys()[0]

        if '_fallbacks' in doc['data']:
            fallbacks = doc['data']['_fallbacks']
        else:
            fallbacks = {}

        logger.info("Parsing [{type}] {data!r} ({id}) from {who}"
                    .format(id=doc["_id"], data=debug_data,
                            type=debug_type, who=receiver_callsign))

        for module in self.modules:
            config = copy.deepcopy(initial_config)
            try:
                callsign = self._get_callsign(raw_data, fallbacks, module)
                config = self._get_config(callsign, config)
                data = self._get_data(raw_data, callsign, config, module)
                if fallbacks:
                    for k, v in fallbacks.iteritems():
                        if k not in data:
                            data[k] = v
                break
            except (CantGetCallsign, CantGetConfig, CantGetData):
                pass

        if type(data) is dict:
            doc['data'].update(data)
            logger.info("{module} parsed data from {callsign} successfully"
                        .format(module=module["name"], callsign=callsign))
            logger.debug("Parsed data: " + json.dumps(data, indent=2))
            statsd.increment("parser.parsed")
            if "_protocol" in data:
                statsd.increment(
                    "parser.protocol.{0}".format(data['_protocol']))
            return doc
        else:
            logger.info("All attempts to parse failed")
            statsd.increment("parser.failed")
            return None
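
    # An illustrative sketch (all values hypothetical) of the minimal
    # telemetry doc parse() expects: "_raw" is the base64-encoded received
    # string, and the optional "_fallbacks" may supply e.g. a "payload"
    # callsign for use when a module cannot extract one.
    #
    #   doc = {
    #       "_id": "...",
    #       "data": {
    #           "_raw": "JCRFWEFNUExFLDEsMipGRg==",  # b64("$$EXAMPLE,1,2*FF")
    #           "_fallbacks": {"payload": "EXAMPLE"},
    #       },
    #       "receivers": {"EXAMPLE_RX": {}},
    #   }
    #   parsed = parser.parse(doc)  # the updated doc, or None on failure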

    def _get_debug(self, raw_data):
        if self.ascii_exp.search(raw_data):
            statsd.increment("parser.ascii_doc")
            return 'ascii', raw_data
        else:
            statsd.increment("parser.binary_doc")
            return 'b64', base64.b64encode(raw_data)

    def _get_callsign(self, raw_data, fallbacks, module):
        """Attempt to find a callsign from the data."""
        raw_data = self.filtering.pre_filter(raw_data, module)

        try:
            callsign = module["module"].pre_parse(raw_data)
        except CantParse as e:
            logger.debug("CantParse exception in {module}: {e}"
                         .format(e=quick_traceback.oneline(e),
                                 module=module['name']))
            statsd.increment("parser.{0}.cantparse".format(module['name']))
            raise CantGetCallsign()
        except CantExtractCallsign as e:
            logger.debug("CantExtractCallsign exception in {m}: {e}"
                         .format(e=quick_traceback.oneline(e),
                                 m=module['name']))
            statsd.increment("parser.{0}.cantextractcallsign"
                             .format(module['name']))
            if 'payload' in fallbacks:
                logger.debug("Could not find callsign but using fallback.")
                statsd.increment("parser.fallback_callsign")
                return fallbacks['payload']
            else:
                raise CantGetCallsign()

        return callsign

    def _get_config(self, callsign, config=None):
        """
        Attempt to get a config doc given the callsign and maybe a provided
        config doc.
        """
        if config and not self._callsign_in_config(callsign, config):
            logger.debug("Callsign {c!r} not found in configuration doc"
                         .format(c=callsign))
            raise CantGetConfig()
        elif config:
            if "_id" not in config:
                config["_id"] = None
            logger.debug("payload_configuration provided (id: {0})"
                         .format(config["_id"]))
            return {"id": config["_id"], "payload_configuration": config}

        config = self._find_config_doc(callsign)
        if not config:
            logger.debug("No configuration doc for {callsign!r} found"
                         .format(callsign=callsign))
            statsd.increment("parser.no_config_doc")
            raise CantGetConfig()

        if "flight_id" in config:
            logger.debug("Selected payload_configuration {0} from flight "
                         "{1} for {2!r}"
                         .format(config["id"], config["flight_id"],
                                 callsign))
        else:
            logger.debug("Selected payload_configuration {0} for {1!r}"
                         .format(config["id"], callsign))
        return config

    def _get_data(self, raw_data, callsign, config, module):
        """Attempt to parse data from what we know so far."""
        sentences = config["payload_configuration"]["sentences"]
        for sentence_index, sentence in enumerate(sentences):
            if sentence["callsign"] != callsign:
                continue
            if sentence["protocol"] != module["name"]:
                continue

            data = self.filtering.intermediate_filter(raw_data, sentence)

            try:
                data = module["module"].parse(data, sentence)
            except (ValueError, KeyError) as e:
                logger.debug("Exception in {module} main parse: {e}"
                             .format(module=module['name'],
                                     e=quick_traceback.oneline(e)))
                statsd.increment("parser.parse_exception")
                continue

            data = self.filtering.post_filter(data, sentence)
            data["_protocol"] = module["name"]
            data["_parsed"] = {
                "time_parsed": strict_rfc3339.now_to_rfc3339_utcoffset(),
                "payload_configuration": config["id"],
                "configuration_sentence_index": sentence_index
            }
            if "flight_id" in config:
                data["_parsed"]["flight"] = config["flight_id"]
            return data

        raise CantGetData()

    def _find_config_doc(self, callsign):
        """
        Attempt to locate a payload_configuration document suitable for
        parsing data from *callsign* at the present moment in time.

        Resolution proceeds as:

        1. Check all started-but-not-yet-ended (aka active) flights for a
           reference to a payload_configuration document that includes this
           callsign in at least one sentence.
        2. If no active flights mention the callsign, obtain the single
           most recently created payload_configuration document that does
           and use it.

        Returns an object that contains the payload_configuration document
        ID, the flight ID if appropriate, and the payload_configuration::

            {
                "id": <payload_configuration doc ID>,
                "payload_configuration": <payload_configuration doc>,
                "flight_id": <flight doc ID>
            }

        The returned document may have more than one sentence object, and
        each should be attempted in order.

        If no configuration can be found, None is returned.
        """
        t = int(time.time())

        flights = self.db.view("flight/end_start_including_payloads",
                               include_docs=True, startkey=[t])
        for flight in flights:
            if flight["key"][1] < t and flight["key"][3] == 1:
                if self._callsign_in_config(callsign, flight["doc"]):
                    return {
                        "id": flight["doc"]["_id"],
                        "flight_id": flight["id"],
                        "payload_configuration": flight["doc"]
                    }

        config = self.db.view(
            "payload_configuration/callsign_time_created_index",
            startkey=[callsign, "inf"], include_docs=True, limit=1,
            descending=True
        ).first()
        # Check the callsign really is in this doc: if no configuration has
        # this callsign, the first document returned above will be for the
        # closest callsign alphabetically (and thus not useful).
        if config and self._callsign_in_config(callsign, config["doc"]):
            return {
                "id": config["id"],
                "payload_configuration": config["doc"]
            }

        return None

    def _callsign_in_config(self, callsign, config):
        return callsign in (s["callsign"]
                            for s in config.get("sentences", []))
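

# An illustrative sketch (all values hypothetical) of the parts of a
# payload_configuration document the parser reads: Parser._get_data()
# matches each entry of "sentences" by its "callsign" and "protocol" keys,
# and ParserFiltering below reads the optional "filters" key.
#
#   {
#       "_id": "...",
#       "sentences": [
#           {
#               "callsign": "EXAMPLE",
#               "protocol": "UKHAS",
#               "filters": {
#                   "intermediate": [...],
#                   "post": [...],
#               },
#           },
#       ],
#   }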


class ParserFiltering(object):
    """Handle filtering of data during parsing."""

    def __init__(self, config, lmgr):
        """
        On construction, scans ``config["parser"]["certs_dir"]`` for CA and
        developer certificates.
        """
        self.config = copy.deepcopy(config)
        self.loadable_manager = lmgr

        self.certificate_authorities = []
        self.cert_path = self.config["parser"]["certs_dir"]

        ca_path = os.path.join(self.cert_path, 'ca')
        for f in os.listdir(ca_path):
            ca = M2Crypto.X509.load_cert(os.path.join(ca_path, f))
            if ca.check_ca():
                self.certificate_authorities.append(ca)
            else:
                raise ValueError("CA certificate is not a CA: {0}"
                                 .format(os.path.join(ca_path, f)))

        self.loaded_certs = {}

    def pre_filter(self, raw_data, module):
        """
        Apply all the module's pre filters, in order, to the data and
        return the resulting filtered data.
        """
        sentence = {"filters": {'pre': module.get('pre-filters', {})}}
        return self._apply_filters(raw_data, sentence, "pre", str)

    def intermediate_filter(self, raw_data, sentence):
        """
        Apply all the intermediate (between getting the callsign and
        parsing) filters specified in the payload's configuration document
        and return the resulting filtered data.
        """
        return self._apply_filters(raw_data, sentence, "intermediate", str)

    def post_filter(self, data, sentence):
        """
        Apply all the post (after parsing) filters specified in the
        payload's configuration document and return the resulting filtered
        data.
        """
        return self._apply_filters(data, sentence, "post", dict)

    def _apply_filters(self, data, sentence, filter_type, result_type):
        if "filters" in sentence:
            if filter_type in sentence["filters"]:
                for index, f in enumerate(sentence["filters"][filter_type]):
                    whence = (filter_type, index)
                    data = self._filter(data, f, result_type, whence)
                statsd.increment("parser.filters.{0}".format(filter_type))
        return data

    def _filter(self, data, f, result_type, filter_whence):
        """
        Load and run a filter from a dictionary specifying type, the
        relevant filter/code and maybe a config.

        Returns the filtered data, or leaves the data untouched if the
        filter could not be run.

        filter_whence is used merely for logging, and should be a tuple:
        (filter_type, filter_index); e.g. ("intermediate", 4).
        """
        rollback = data
        data = copy.deepcopy(data)

        try:
            if f["type"] == "normal":
                fil = 'filters.' + f['filter']
                filter_whence += ("normal", fil)
                data = self.loadable_manager.run(fil, f, data)
            elif f["type"] == "hotfix":
                filter_whence += ("hotfix", )
                data = self._hotfix_filter(data, f)
            else:
                raise ValueError("Invalid filter type")

            if not data or not isinstance(data, result_type):
                raise ValueError("Filter returned no output or output of "
                                 "the wrong type")
        except:
            # Deliberately catch everything: hotfix code in particular may
            # raise any exception, and a failing filter must not kill
            # parsing.
            logger.debug("Error while applying filter {0}: {1}"
                         .format(filter_whence, quick_traceback.oneline()))
            return rollback
        else:
            return data

    def _sanity_check_hotfix(self, f):
        """Perform basic sanity checks on **f**."""
        if "code" not in f:
            raise ValueError("Hotfix didn't have any code")
        if "signature" not in f:
            raise ValueError("Hotfix didn't have a signature")
        if "certificate" not in f:
            raise ValueError("Hotfix didn't specify a certificate")
        if os.path.basename(f["certificate"]) != f["certificate"]:
            raise ValueError("Hotfix's specified certificate was invalid")

    def _verify_certificate(self, f, cert):
        """
        Check that the certificate is cryptographically signed by a key
        which is signed by a known CA.
        """
        # Check the certificate is signed by one of our CAs
        for ca_cert in self.certificate_authorities:
            if cert.verify(ca_cert.get_pubkey()):
                break
        else:
            raise ValueError("Certificate is not signed by a recognised "
                             "CA.")

        # Check the signature is valid
        try:
            digest = hashlib.sha256(f["code"]).hexdigest()
            sig = base64.b64decode(f["signature"])
            ok = cert.get_pubkey().get_rsa().verify(digest, sig, 'sha256')
        except (TypeError, M2Crypto.RSA.RSAError):
            statsd.increment("parser.filters.hotfix.invalid_signature")
            raise ValueError("Hotfix signature is not valid")

        if not ok:
            statsd.increment("parser.filters.hotfix.invalid_signature")
            raise ValueError("Hotfix signature is not valid")

    def _compile_hotfix(self, f):
        """Compile a hotfix into a function ``f`` in an empty namespace."""
        logger.debug("Compiling a hotfix")
        body = "def f(data):\n"
        env = {}

        try:
            body += "\n".join("    " + l + "\n"
                              for l in f["code"].split("\n"))
            code = compile(body, "<filter>", "exec")
            exec code in env
        except (SyntaxError, AttributeError, TypeError):
            statsd.increment("parser.filters.hotfix.compile_error")
            raise ValueError("Hotfix code didn't compile: " + repr(f))

        return env

    def _hotfix_filter(self, data, f):
        """
        Load a filter specified by some code in the database. Check its
        authenticity by verifying its certificate, then run it if OK.
        """
        self._sanity_check_hotfix(f)
        cert = self._get_certificate(f["certificate"])
        self._verify_certificate(f, cert)
        env = self._compile_hotfix(f)

        logger.debug("Executing a hotfix")
        statsd.increment("parser.filters.hotfix.executed")
        return env["f"](data)

    def _get_certificate(self, certname):
        """
        Fetch the specified certificate, returning the X509 object.
        Uses an instance cache to prevent too much filesystem I/O.
        """
        if certname in self.loaded_certs:
            return self.loaded_certs[certname]

        cert_path = os.path.join(self.cert_path, "certs", certname)
        if os.path.exists(cert_path):
            try:
                cert = M2Crypto.X509.load_cert(cert_path)
            except (IOError, M2Crypto.X509.X509Error):
                raise ValueError("Certificate could not be loaded.")
            self.loaded_certs[certname] = cert
            return cert
        else:
            raise ValueError("Certificate could not be loaded.")
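

# Illustrative sketches (all values hypothetical) of the two filter entries
# _filter() accepts. A "normal" filter names a loadable that is run through
# the LoadableManager under the "filters." prefix; a "hotfix" filter
# carries signed Python source that _compile_hotfix() wraps in
# "def f(data):", so the code must end by returning the (possibly
# modified) data.
#
#   {"type": "normal", "filter": "some_module.some_filter"}
#
#   {
#       "type": "hotfix",
#       "code": "data['temperature'] /= 10.0\nreturn data",
#       "signature": "<base64 RSA signature over the SHA-256 hex digest
#                      of the code>",
#       "certificate": "developer_cert.pem",
#   }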


class ParserModule(object):
    """
    Base class for real ParserModules to inherit from.

    **ParserModules** are classes which turn radio strings into useful
    data. They do not have to inherit from :class:`ParserModule`, but can
    if they want. They must implement :meth:`pre_parse` and :meth:`parse`
    as described below.
    """

    def __init__(self, parser):
        self.parser = parser
        self.loadable_manager = parser.loadable_manager

    def pre_parse(self, string):
        """
        Go through *string* and attempt to extract a callsign, returning
        it as a string. If *string* is not parseable by this module, raise
        :py:class:`CantParse`.

        If *string* might be parseable but no callsign could be extracted,
        raise :py:class:`CantExtractCallsign`.
        """
        raise NotImplementedError()

    def parse(self, string, config):
        """
        Go through *string*, which has been identified as the format this
        parser module should be able to parse, extracting the data as per
        the information in *config*, which is the ``sentence`` dictionary
        extracted from the payload's configuration document.
        """
        raise NotImplementedError()
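

# An illustrative sketch (not a real habitat module) of the minimal shape
# of a ParserModule subclass; the "$$callsign,value" format handled here
# is invented for the example.
#
#   class ExampleParserModule(ParserModule):
#       def pre_parse(self, string):
#           if not string.startswith("$$"):
#               raise CantParse("not an example sentence")
#           callsign = string[2:].split(",")[0]
#           if not callsign:
#               raise CantExtractCallsign()
#           return callsign
#
#       def parse(self, string, config):
#           callsign, value = string[2:].strip().split(",")
#           return {"payload": callsign, "value": value}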


class CantGetCallsign(Exception):
    # Parser internal use.
    pass


class CantGetConfig(Exception):
    # Parser internal use.
    pass


class CantGetData(Exception):
    # Parser internal use.
    pass


class CantParse(Exception):
    """Parser module cannot parse the given sentence."""
    pass


class CantExtractCallsign(Exception):
    """
    Parser submodule cannot find a callsign, though in theory might be
    able to parse the sentence if one were provided.
    """
    pass