Package logsparser :: Module lognormalizer
[frames] | no frames]

Source Code for Module logsparser.lognormalizer

  1  # -*- python -*- 
  2   
  3  # pylogsparser - Logs parsers python library 
  4  # 
  5  # Copyright (C) 2011 Wallix Inc. 
  6  # 
  7  # This library is free software; you can redistribute it and/or modify it 
  8  # under the terms of the GNU Lesser General Public License as published by the 
  9  # Free Software Foundation; either version 2.1 of the License, or (at your 
 10  # option) any later version. 
 11  # 
 12  # This library is distributed in the hope that it will be useful, but WITHOUT 
 13  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 14  # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 
 15  # details. 
 16  # 
 17  # You should have received a copy of the GNU Lesser General Public License 
 18  # along with this library; if not, write to the Free Software Foundation, Inc., 
 19  # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
 20  # 
 21   
 22   
 23  """This module exposes the L{LogNormalizer} class that can be used for 
 24  higher-level management of the normalization flow. 
 25  Using this module is in no way mandatory in order to benefit from 
 26  the normalization system; the C{LogNormalizer} class provides basic facilities 
 27  for further integration in a wider project (web services, ...). 
 28  """ 
 29   
 30  import os 
 31  import uuid as _UUID_ 
 32  import warnings 
 33  import StringIO 
 34   
 35  from normalizer import Normalizer 
 36  from lxml.etree import parse, DTD, fromstring as XMLfromstring 
 37   
class LogNormalizer():
    """Basic normalization flow manager.

    Normalizers definitions are loaded from a path and checked against the DTD.
    If the definitions are syntactically correct, the normalizers are
    instantiated and populate the manager's cache.
    Normalization priority is established as follows:

    * Maximum priority assigned to normalizers where the "appliedTo" tag is set
      to "raw". They MUST be mutually exclusive.
    * Medium priority assigned to normalizers where the "appliedTo" tag is set
      to "body".
    * Lowest priority assigned to any remaining normalizers.

    Some extra treatment is also done prior and after the log normalization:

    * Assignment of a unique ID, under the tag "uuid"
    * Conversion of date tags to UTC, if the "_timezone" was set prior to
      the normalization process."""

    def __init__(self, normalizers_paths, active_normalizers={}):
        """
        Instantiates a flow manager. The default behavior is to activate every
        available normalizer.

        @param normalizers_paths: a list (or tuple) of absolute paths to the
            normalizer XML definitions to use, or just a single path as str.
        @param active_normalizers: a dictionary of active normalizers
            in the form {name: [True|False]}.
        @raise ValueError: if one of the given paths is not a directory.
        @raise StandardError: if the DTD or one of the common library files
            could not be found in any of the given directories.
        """
        # A lone path is wrapped in a list; lists AND tuples are used as given.
        # (The former test "list or tuple" only ever recognized lists.)
        if not isinstance(normalizers_paths, (list, tuple)):
            normalizers_paths = [normalizers_paths]
        self.normalizers_paths = normalizers_paths
        # Work on a copy so that neither the caller's dictionary nor the
        # shared default argument is ever aliased by this instance.
        self.active_normalizers = dict(active_normalizers or {})
        self.dtd, self.ctt, self.ccb = None, None, None

        # Walk through paths for normalizer.dtd and common_tagTypes.xml
        # /!\ dtd file and common elements will be overridden if present in
        # many directories.
        for norm_path in self.normalizers_paths:
            if not os.path.isdir(norm_path):
                raise ValueError("Invalid normalizer directory : %s" % norm_path)
            dtd = os.path.join(norm_path, 'normalizer.dtd')
            ctt = os.path.join(norm_path, 'common_tagTypes.xml')
            ccb = os.path.join(norm_path, 'common_callBacks.xml')
            if os.path.isfile(dtd):
                # The DTD is parsed at construction time, so the handle can be
                # released as soon as the object is built.
                with open(dtd, 'rb') as dtd_file:
                    self.dtd = DTD(dtd_file)
            if os.path.isfile(ctt):
                self.ctt = ctt
            if os.path.isfile(ccb):
                self.ccb = ccb
        # Technically the common elements files should NOT be mandatory.
        # But many normalizers use them, so better safe than sorry.
        if not self.dtd or not self.ctt or not self.ccb:
            raise StandardError("Missing DTD or common library files")
        self._cache = []
        self.reload()

    def reload(self):
        """Refreshes this instance's normalizers pool.

        Every definition file yielded by L{iter_normalizer} is parsed and
        validated against the DTD; invalid definitions are skipped with a
        warning. Valid ones are instantiated, given a uuid and stored by
        their "appliedTo" value. The activation cache is rebuilt afterwards.
        """
        self.normalizers = {'raw': [], 'body': []}
        for path in self.iter_normalizer():
            # Close each definition file as soon as it has been parsed.
            with open(path, 'rb') as source:
                norm = parse(source)
            if not self.dtd.validate(norm):
                warnings.warn('Skipping %s : invalid DTD' % path)
                print('invalid normalizer  %s' % path)
            else:
                normalizer = Normalizer(norm, self.ctt, self.ccb)
                normalizer.uuid = self._compute_norm_uuid(normalizer)
                pool = self.normalizers.setdefault(normalizer.appliedTo, [])
                pool.append(normalizer)
        self.activate_normalizers()

    def _compute_norm_uuid(self, normalizer):
        """Builds the identifier of a normalizer as "<name>-<version>"."""
        return "%s-%s" % (normalizer.name, normalizer.version)

    def iter_normalizer(self):
        """ Iterates through normalizers and returns the normalizers' paths.

        The common library files (common_tagTypes*, common_callBacks*) are
        not normalizers and are therefore skipped.

        @return: a generator of absolute paths.
        """
        library_prefixes = ('common_tagTypes', 'common_callBacks')
        for path in self.normalizers_paths:
            for root, _dirs, files in os.walk(path):
                for name in files:
                    if name.endswith('.xml') and \
                       not name.startswith(library_prefixes):
                        yield os.path.join(root, name)

    def __len__(self):
        """ Returns the amount of available normalizers.
        """
        # Count lazily instead of building a throwaway list.
        return sum(1 for _ in self.iter_normalizer())

    def update_normalizer(self, raw_xml_contents, name=None, dir_path=None):
        """used to add or update a normalizer.

        @param raw_xml_contents: XML description of normalizer as flat XML. It
            must comply to the DTD.
        @param name: if set, the XML description will be saved as name.xml.
            If left blank, name will be fetched from the XML description.
        @param dir_path: the path to the directory where to copy the given
            normalizer. It is only honored if it is one of the paths this
            instance was created with; otherwise the first path is used.
        @raise ValueError: if the definition does not follow the DTD, or if
            no name was given and none can be read from the definition.
        """
        path = self.normalizers_paths[0]
        if dir_path and dir_path in self.normalizers_paths:
            path = dir_path
        xmlconf = XMLfromstring(raw_xml_contents).getroottree()
        if not self.dtd.validate(xmlconf):
            raise ValueError(
                "This definition file does not follow the normalizers DTD :\n\n%s" %
                (self.dtd.error_log.filter_from_errors(),))
        if not name:
            name = xmlconf.getroot().get('name')
        if not name:
            raise ValueError("No name given and none found in the definition")
        if not name.endswith('.xml'):
            name += '.xml'
        # The file must be flushed and closed before reload() reads it back.
        with open(os.path.join(path, name), 'wb') as target:
            xmlconf.write(target,
                          encoding='utf8',
                          method='xml',
                          pretty_print=True)
        self.reload()

    def get_normalizer_by_uuid(self, uuid):
        """Returns normalizer by uuid.

        @raise ValueError: if no loaded normalizer carries this uuid.
        """
        for group in self.normalizers.values():
            for candidate in group:
                if candidate.uuid == uuid:
                    return candidate
        raise ValueError("Normalizer uuid : %s not found" % uuid)

    def get_normalizer_source(self, uuid):
        """Returns the raw XML source of normalizer uuid."""
        return self.get_normalizer_by_uuid(uuid).get_source()

    def get_normalizer_path(self, uuid):
        """Returns the filesystem path of a normalizer."""
        return self.get_normalizer_by_uuid(uuid).sys_path

    def activate_normalizers(self):
        """Activates normalizers according to what was set by calling
        set_active_normalizers. If no call to the latter function has been
        made so far, this method activates every normalizer."""
        if not self.active_normalizers:
            self.active_normalizers = dict(
                (norm.uuid, True)
                for group in self.normalizers.values()
                for norm in group)
        # fool-proof the list
        self.set_active_normalizers(self.active_normalizers)
        # build an ordered cache to speed things up:
        # first the "raw" normalizers, then the applicative ones on "body",
        # then everything else.
        ordered = list(self.normalizers['raw'])
        ordered.extend(self.normalizers['body'])
        for applied_to in self.normalizers:
            if applied_to not in ('raw', 'body'):
                ordered.extend(self.normalizers[applied_to])
        # consider a normalizer to be inactive if not explicitly in our list
        self._cache = [norm for norm in ordered
                       if self.active_normalizers.get(norm.uuid, False)]

    def get_active_normalizers(self):
        """Returns a dictionary of normalizers; keys are normalizers' uuid and
        values are True|False according to the normalizer's activation state."""
        return self.active_normalizers

    def set_active_normalizers(self, norms={}):
        """Sets the active/inactive normalizers. Default behavior is to
        deactivate every normalizer.

        The internal cache is not rebuilt here; activate_normalizers does it.

        @param norms: a dictionary, similar to the one returned by
            get_active_normalizers."""
        states = dict(
            (norm.uuid, False)
            for group in self.normalizers.values()
            for norm in group)
        # norms is only read, never modified.
        if norms:
            states.update(norms)
        self.active_normalizers = states

    def lognormalize(self, data):
        """ This method is the entry point to normalize data (a log).

        data is passed through every activated normalizer
        and extra tagging occurs accordingly.

        data receives also an extra uuid tag.

        @param data: must be a dictionary with at least a key 'raw' or 'body'
            with BaseString values (preferably Unicode).
        @return: the normalized log. The dictionary passed in is also
            updated in place by uuidify, so callers may ignore the return
            value.

        Here an example :
        >>> from logsparser import lognormalizer
        >>> from pprint import pprint
        >>> ln = lognormalizer.LogNormalizer('/usr/local/share/normalizers/')
        >>> mylog = {'raw' : 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)'}
        >>> ln.lognormalize(mylog)
        >>> pprint(mylog)
        {'body': '(root) CMD (/srv/git/redmine-changesets.sh)',
         'date': datetime.datetime(2011, 7, 18, 15, 35, 1),
         'pid': '14338',
         'program': '/USR/SBIN/CRON',
         'raw': 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)',
         'source': 'zoo',
         'uuid': 70851882840934161193887647073096992594L}
        """
        data = self.uuidify(data)
        data = self.normalize(data)
        return data

    # some more functions for clarity
    def uuidify(self, log):
        """Adds a unique UID to the normalized log."""
        log["uuid"] = _UUID_.uuid4().int
        return log

    def normalize(self, log):
        """plain normalization."""
        for norm in self._cache:
            log = norm.normalize(log)
        return log

    def _normalize(self, log):
        """Used for testing only, the normalizers' tags prerequisite are
        deactivated."""
        for norm in self._cache:
            log = norm.normalize(log, do_not_check_prereq=True)
        return log
266