Package logilab :: Package common :: Module textutils
[frames | no frames]

Source Code for Module logilab.common.textutils

  1  # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-common. 
  5  # 
  6  # logilab-common is free software: you can redistribute it and/or modify it under 
  7  # the terms of the GNU Lesser General Public License as published by the Free 
  8  # Software Foundation, either version 2.1 of the License, or (at your option) any 
  9  # later version. 
 10  # 
 11  # logilab-common is distributed in the hope that it will be useful, but WITHOUT 
 12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 13  # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more 
 14  # details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-common.  If not, see <http://www.gnu.org/licenses/>. 
 18  """Some text manipulation utility functions. 
 19   
 20   
 21  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
 22  unquote, colorize_ansi 
 23  :group text manipulation: searchall, splitstrip 
 24  :sort: text formatting, text manipulation 
 25   
 26  :type ANSI_STYLES: dict(str) 
 27  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 28   
 29  :type ANSI_COLORS: dict(str) 
 30  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 31   
 32  :type ANSI_PREFIX: str 
 33  :var ANSI_PREFIX: 
 34    ANSI terminal code notifying the start of an ANSI escape sequence 
 35   
 36  :type ANSI_END: str 
 37  :var ANSI_END: 
 38    ANSI terminal code notifying the end of an ANSI escape sequence 
 39   
 40  :type ANSI_RESET: str 
 41  :var ANSI_RESET: 
 42    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 43  """ 
# Declares the markup language used by the docstrings of this module
# (reStructuredText, English), for documentation tools that read it.
__docformat__ = "restructuredtext en"
 45   
 46  import sys 
 47  import re 
 48  import os.path as osp 
 49  from warnings import warn 
 50  from unicodedata import normalize as _uninormalize 
 51  try: 
 52      from os import linesep 
 53  except ImportError: 
 54      linesep = '\n' # gae 
 55   
 56  from logilab.common.deprecation import deprecated 
 57   
# Hand-written transliteration table: unormalize() looks a character up here
# first and only falls back to NFKD decomposition when it is absent.
# A replacement may be longer than one character (e.g. u'(c)').
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',  # FRACTION SLASH
    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',  # COPYRIGHT SIGN
    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
    u'\xae': u'(r)',  # REGISTERED SIGN
    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
    u'\u2013': u'-',  # EN DASH
    u'\u2019': u"'",  # RIGHT SINGLE QUOTATION MARK
    }
 76   
def unormalize(ustring, ignorenonascii=None, substitute=None):
    """transliterate a unicode string to ascii characters

    Each character is first looked up in `MANUAL_UNICODE_MAP`. When it is
    not listed there, it is converted to its compatibility decomposition
    (normal form KD, which may expand one character into several) and only
    the first character of that decomposition is kept.

    :type ustring: unicode
    :param ustring: the string to transliterate

    :type ignorenonascii: bool or None
    :param ignorenonascii:
      deprecated; a true value is equivalent to ``substitute=''``

    :type substitute: str
    :param substitute: replacement character to use if decomposition fails

    :raise ValueError:
      if a character has no ascii equivalent and `substitute` is None

    :rtype: unicode
    :return: the transliterated string

    :see: Another project about ASCII transliterations of Unicode text
          http://pypi.python.org/pypi/Unidecode
    """
    # backward compatibility, ignorenonascii was a boolean
    if ignorenonascii is not None:
        warn("ignorenonascii is deprecated, use substitute named parameter instead",
             DeprecationWarning, stacklevel=2)
        if ignorenonascii:
            substitute = ''
    chars = []
    for char in ustring:
        if char in MANUAL_UNICODE_MAP:
            ascii_form = MANUAL_UNICODE_MAP[char]
        else:
            ascii_form = _uninormalize('NFKD', char)[0]
            # still outside the 7-bit range: no ascii base character exists
            if ord(ascii_form) >= 128:
                if substitute is None:
                    raise ValueError("can't deal with non-ascii based characters")
                ascii_form = substitute
        chars.append(ascii_form)
    return u''.join(chars)
109
def unquote(string):
    """remove optional quotes (simple or double) from the string

    A leading quote and a trailing quote are removed independently of each
    other: they do not have to match, and either may be absent.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty at this point when the input was a lone quote
    # character, so it must be tested before looking at its last character
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# Paragraph separator: two consecutive line breaks, each with an optional
# carriage return before the newline.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace characters. Written as a raw string because '\s' is
# not a valid string escape sequence.
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """re-wrap a text made of several paragraphs

    The text is split into paragraphs on blank lines, each paragraph is
    wrapped separately, and the wrapped paragraphs are joined back with a
    line holding only the indentation string. The indentation string may be
    used to insert a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true paragraphs are wrapped with `normalize_rest_paragraph`,
      otherwise with `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the wrapped text, each line being prefixed by the indentation string
    """
    normp = normalize_rest_paragraph if rest else normalize_paragraph
    separator = '%s%s%s' % (linesep, indent, linesep)
    paragraphs = [normp(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    return separator.join(paragraphs)
160 161
def normalize_paragraph(text, line_len=80, indent=''):
    """re-wrap a single paragraph

    Every run of whitespace (line breaks included) is first replaced by one
    space, then the text is cut into lines with `splittext`. The indentation
    string may be used to insert a comment mark for instance; its length is
    deducted from the width available for the text.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the wrapped paragraph, each line being prefixed by the indentation
      string
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
190
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """re-wrap a ReST paragraph, keeping its existing line breaks

    Unlike `normalize_paragraph`, input lines are never merged together:
    each one has its whitespace runs collapsed and is cut with `splittext`
    only when it is longer than the available width. The indentation string
    may be used to insert a comment mark for instance; its length is
    deducted from the width available for the text.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the wrapped paragraph, each line being prefixed by the indentation
      string
    """
    width = line_len - len(indent)
    output = []
    for raw_line in text.splitlines():
        current = _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        while len(current) > width:
            # too long line, need split
            head, tail = splittext(current, width)
            output.append(indent + head)
            # what does not fit is handled by the next turn of the loop
            current = (tail + ' ') if tail else ''
        if current:
            output.append(indent + current.strip())
    return linesep.join(output)
230 231
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The split happens on the last space found at or before `line_len`. When
    there is none (the first word alone is too long), the split happens on
    the first space found after `line_len`, so a word is never cut.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the returned line; zero and negative values are
      accepted and mean that the first word is returned alone

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    if len(text) <= line_len:
        return text, ''
    # clamp to 0: a negative width would otherwise be used as an index
    # relative to the end of the text and silently drop a character
    pos = max(0, min(len(text) - 1, line_len))
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # no space before the limit, look for the first one after it
        pos = max(0, min(len(text), line_len))
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos+1:].strip()
249 250
def splitstrip(string, sep=','):
    """split the string on `sep` (',' by default) and strip every item

    Items which are empty once stripped are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return: the stripped, non empty items, in their original order
    """
    stripped = (item.strip() for item in string.split(sep))
    return [item for item in stripped if item]
# Former name of splitstrip, kept for backward compatibility. The alias is
# built by passing splitstrip through logilab.common.deprecation.deprecated
# with the message below (presumably reported to the caller on each use --
# see that module for the exact behaviour).
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
def split_url_or_path(url_or_path):
    """split an url or a local file system path on its last component

    A string containing '://' is handled as an url of the form
    <scheme>://<path>, anything else as a local file system path. Trailing
    separators are ignored in both cases.

    :type url_or_path: str or unicode
    :param url_or_path: the url or path to split

    :rtype: list or tuple
    :return:
      a ``[head, last component]`` list for an url, the
      ``(head, last component)`` tuple built by `os.path.split` for a path
    """
    if '://' not in url_or_path:
        return osp.split(url_or_path.rstrip(osp.sep))
    trimmed = url_or_path.rstrip('/')
    return trimmed.rsplit('/', 1)
282 283
def text_to_dict(text):
    """parse a multilines text made of simple 'key=value' lines into a dict

    Blank lines and lines starting with '#' are ignored. Keys and values are
    stripped, and only the first '=' of a line is a separator. A key found
    once maps to its value; a key found several times maps to the list of
    all its values, in order of appearance.

    >>> d = text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    >>> d['single']
    '3'
    >>> d['multiple']
    ['1', '2']

    :type text: str or unicode
    :param text: the text to parse; an empty or None value gives an empty dict

    :raise ValueError: if a significant line has no '=' sign

    :rtype: dict
    :return: the parsed {'key': 'value'} mapping
    """
    parsed = {}
    if not text:
        return parsed
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue
        key, value = [part.strip() for part in entry.split('=', 1)]
        if key not in parsed:
            parsed[key] = value
        elif isinstance(parsed[key], list):
            parsed[key].append(value)
        else:
            # second occurrence: turn the single value into a list
            parsed[key] = [parsed[key], value]
    return parsed
# Characters ignored by apply_units: whitespace and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# A number: optional minus sign, then either a decimal with a dot or a run of
# digits optionally prefixed by "0" / "0x".
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit name: letters only.
__UNITS_URE = r'[a-zA-Z]+'
# One value with its optional unit, exposed as the "value" and "unit" groups.
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))
# A whole blank-free string: any number of value+unit pairs, optionally
# followed by one last value without unit.
_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE,
                                                    __VALUE_URE))

# Unit name -> number of bytes.
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Unit name -> number of seconds.
TIME_UNITS = {
    "ms": 0.001,  # one millisecond is a thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    Blank characters are removed first, then every value found in the string
    is converted, multiplied by the factor of its unit when it has one, and
    all the results are summed (e.g. "1h 30min" with time units).

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units:
      a dict mapping a unit string repr to its value; units found in the
      string are lower-cased before being looked up

    :type inter: type
    :param inter:
      used to parse every intermediate value (need __sum__), default to
      `final`

    :type final: type
    :param final: used to convert the sum of all values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit into the string

    :raise ValueError:
      if the string is empty once blanks are removed, or isn't a sequence
      of values and units
    :raise KeyError: if a unit found in the string isn't defined in `units`

    :return: the sum of all values found, converted with `final`
    """
    if inter is None:
        inter = final
    # use the regexp given by the caller, not the module default
    fstring = blank_reg.sub('', string)
    if not (fstring and _VALIDATION_RE.match(fstring)):
        raise ValueError("Invalid unit string: %r." % string)
    values = []
    for match in value_reg.finditer(fstring):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))
# Matches one line break, whatever its convention: "\r\n", a run of one or
# more "\r", or "\n". pretty_match() substitutes os.linesep for each match.
_LINE_RGX = re.compile('\r\n|\r+|\n')
def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line breaks of `string` are first rewritten as `linesep`; the underline
    is inserted as a new line right after the line where the match ends,
    and trailing whitespace of the result is removed.

    NOTE(review): the match offsets are computed on the original string but
    used on the rewritten one, so they are presumably only reliable when the
    rewriting does not change the length of the text before the match.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      carret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    first, last = match.start(), match.end()
    normalized = _LINE_RGX.sub(linesep, string)
    pieces = []
    # locate the beginning of the line where the match starts
    line_start = normalized.rfind(linesep, 0, first)
    if line_start == -1:
        line_start = 0
    else:
        # everything before that line is kept as is
        pieces.append(normalized[:line_start])
        line_start += len(linesep)
    marker = ' ' * (first - line_start) + underline_char * (last - first)
    # locate the end of the line where the match ends
    line_end = normalized.find(linesep, last)
    if line_end == -1:
        pieces.extend([normalized[line_start:], marker])
    else:
        pieces.extend([normalized[line_start:line_end], marker,
                       normalized[line_end + len(linesep):]])
    return linesep.join(pieces).rstrip()
# Ansi colorization ###########################################################

# Start of an ANSI escape sequence: the ESC character followed by '['.
ANSI_PREFIX = '\033['
# Character closing the escape sequences built by _get_ansi_code.
ANSI_END = 'm'
# Complete escape sequence (code 0) appended by colorize_ansi after the
# message to reset the formatting.
ANSI_RESET = '\033[0m'
# Style identifier -> ANSI terminal code.
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
# Color identifier -> ANSI terminal code.
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
def _get_ansi_code(color=None, style=None):
    """build the ansi escape code for the given color and style

    Style codes come first, in the order they are given, followed by the
    color code. A color made of digits only is used as a color number and
    is prefixed by the '38' and '5' codes.

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a coma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor a
      style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
493
def colorize_ansi(msg, color=None, style=None):
    """wrap a message between an ansi escape code and the ansi reset code

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a coma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or the message itself when there is nothing
      to apply
    """
    # nothing asked, leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # no code was built (e.g. color and style are empty strings), leave the
    # text as is here too
    if not escape_code:
        return msg
    return '%s%s%s' % (escape_code, msg, ANSI_RESET)
# Default mapping used by diff_colorize_ansi: kind of diff line -> color
# identifier (the values are keys of ANSI_COLORS).
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}
def diff_colorize_ansi(lines, out=None, style=DIFF_STYLE):
    """write the lines of a diff to a stream, colorized with ansi codes

    Lines starting with '--- ' or '+++ ' use the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting
    with '+' the 'add' color. Any other line, an empty one included, is
    written unchanged. Nothing is added to the lines: they are expected to
    hold their own line break.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff

    :type out: file-like object or None
    :param out:
      stream with a `write` method receiving the result, default to the
      current `sys.stdout`

    :type style: dict
    :param style:
      mapping giving the color identifier to use for the 'separator',
      'remove' and 'add' keys, default to `DIFF_STYLE`

    :raise KeyError: if a needed key is missing from `style`
    """
    if out is None:
        # resolved at call time so that a redirected sys.stdout is honoured
        out = sys.stdout
    for line in lines:
        # file headers must be tested first since they also start with - or +
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        # slices rather than line[0] so an empty line does not raise
        elif line[:1] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[:1] == '+':
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)
539