1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
"""Some text manipulation utility functions.


:group text formatting: normalize_text, normalize_paragraph, pretty_match,\
                        unquote, colorize_ansi
:group text manipulation: searchall, splitstrip
:sort: text formatting, text manipulation

:type ANSI_STYLES: dict(str)
:var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

:type ANSI_COLORS: dict(str)
:var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

:type ANSI_PREFIX: str
:var ANSI_PREFIX:
  ANSI terminal code notifying the start of an ANSI escape sequence

:type ANSI_END: str
:var ANSI_END:
  ANSI terminal code notifying the end of an ANSI escape sequence

:type ANSI_RESET: str
:var ANSI_RESET:
  ANSI terminal code resetting format defined by a previous ANSI escape sequence
"""
44 __docformat__ = "restructuredtext en"
45
46 import sys
47 import re
48 import os.path as osp
49 from warnings import warn
50 from unicodedata import normalize as _uninormalize
51 try:
52 from os import linesep
53 except ImportError:
54 linesep = '\n'
55
56 from logilab.common.deprecation import deprecated
57
# Hand-written ASCII transliterations, looked up before falling back to
# Unicode decomposition. Values may be several characters long.
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',       # inverted exclamation mark
    u'\u0142': u'l',     # latin small letter l with stroke
    u'\u2044': u'/',     # fraction slash
    u'\xc6': u'AE',      # latin capital letter ae
    u'\xa9': u'(c)',     # copyright sign
    u'\xab': u'"',       # left-pointing double angle quotation mark
    u'\xe6': u'ae',      # latin small letter ae
    u'\xae': u'(r)',     # registered sign
    u'\u0153': u'oe',    # latin small ligature oe
    u'\u0152': u'OE',    # latin capital ligature oe
    u'\xd8': u'O',       # latin capital letter o with stroke
    u'\xf8': u'o',       # latin small letter o with stroke
    u'\xbb': u'"',       # right-pointing double angle quotation mark
    u'\xdf': u'ss',      # latin small letter sharp s
    u'\u2013': u'-',     # en dash
    u'\u2019': u"'",     # right single quotation mark
}


def unormalize(ustring, ignorenonascii=None, substitute=None):
    """replace diacritical characters with their corresponding ascii characters

    Each character is first looked up in `MANUAL_UNICODE_MAP`. Otherwise it
    is converted to its long normalized form (the unicode character is
    transformed into several characters) and only the first one is kept.
    The normal form KD (NFKD) applies the compatibility decomposition, i.e.
    replaces all compatibility characters with their equivalents.

    :type ustring: unicode
    :param ustring: the string to transliterate

    :type ignorenonascii: bool or None
    :param ignorenonascii:
      deprecated; a true value is equivalent to ``substitute=''``. Passing
      anything but None emits a `DeprecationWarning`.

    :type substitute: str
    :param substitute: replacement character to use if decomposition fails

    :raise ValueError:
      if a character has no ascii equivalent and `substitute` is None

    :rtype: unicode
    :return: the transliterated string

    :see: Another project about ASCII transliterations of Unicode text
          http://pypi.python.org/pypi/Unidecode
    """
    # backward compatibility, ignorenonascii was a boolean
    if ignorenonascii is not None:
        warn("ignorenonascii is deprecated, use substitute named parameter instead",
             DeprecationWarning, stacklevel=2)
        if ignorenonascii:
            substitute = ''
    res = []
    for letter in ustring:
        try:
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            # keep only the base character of the decomposition
            replacement = _uninormalize('NFKD', letter)[0]
            # the ascii check only applies here: manual replacements may be
            # several characters long, which ord() would reject
            if ord(replacement) >= 2 ** 7:
                if substitute is None:
                    raise ValueError("can't deal with non-ascii based characters")
                replacement = substitute
        res.append(replacement)
    return u''.join(res)
109
def unquote(string):
    """remove optional quotes (simple or double) from the string

    The leading and the trailing quote are stripped independently of each
    other, so a string quoted on one side only loses that single quote.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string may have become empty (input was a lone quote character):
    # guard the index access instead of raising IndexError
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
126
127
# raw strings: '\s' is not a valid escape sequence in a plain string literal
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
_NORM_SPACES_RGX = re.compile(r'\s+')


def normalize_text(text, line_len=80, indent='', rest=False):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      if true, paragraphs are wrapped with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    if rest:
        normp = normalize_rest_paragraph
    else:
        normp = normalize_paragraph
    paragraphs = [normp(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    # paragraphs are separated by a line holding only the indentation string
    return ('%s%s%s' % (linesep, indent, linesep)).join(paragraphs)


def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    text = _NORM_SPACES_RGX.sub(' ', text)
    # an indentation longer than line_len would give a negative width, which
    # used to make the loop below spin forever: clamp it to zero
    line_len = max(line_len - len(indent), 0)
    lines = []
    while text:
        aline, text = splittext(text.strip(), line_len)
        lines.append(indent + aline)
    return linesep.join(lines)


def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Each input line is wrapped on its own: the remainder of a line that was
    too long is not merged with the following input line.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    lines = []
    # same clamping as in normalize_paragraph: a negative width would never
    # let the splitting loop terminate
    line_len = max(line_len - len(indent), 0)
    for raw_line in text.splitlines():
        line = _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        while len(line) > line_len:
            # too long line, need split
            head, remainder = splittext(line, line_len)
            lines.append(indent + head)
            # the trailing space is removed by the strip() calls below
            line = remainder + ' ' if remainder else ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)


def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The split is done on the last space found at or before `line_len`. When
    there is none, the first space after `line_len` is used, so the returned
    line may be longer than `line_len`. A negative `line_len` is handled
    as 0.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len: expected maximum length of the first returned item

    :rtype: tuple
    :return:
      a 2-tuple:
      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # a negative size would turn into a negative slice index below
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    pos = min(len(text) - 1, line_len)
    # look backward for the closest space
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # no space before the limit: look forward instead
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos + 1:].strip()
249
250
def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return: the non-empty fields of `string`, stripped of surrounding blanks
    """
    # strip each field once, then drop the ones that end up empty
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
271
# former name of splitstrip, wrapped by `deprecated` with the message below
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
273
274
276 """return the latest component of a string containing either an url of the
277 form <scheme>://<path> or a local file system path
278 """
279 if '://' in url_or_path:
280 return url_or_path.rstrip('/').rsplit('/', 1)
281 return osp.split(url_or_path.rstrip(osp.sep))
282
283
def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    Blank lines and lines starting with '#' are ignored. Keys and values are
    stripped, and only the first '=' of a line is used as separator.

    >>> d = text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    >>> d['single']
    '3'
    >>> d['multiple']
    ['1', '2']

    :type text: str or unicode or None
    :param text: the text to parse; a false value gives an empty dict

    :raise ValueError: if a significant line does not contain '='

    :rtype: dict
    :return: the parsed key / value(s) mapping
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, sep, value = line.partition('=')
        if not sep:
            # same exception type as the former tuple unpacking failure,
            # with a message that tells which line is at fault
            raise ValueError("invalid line %r, expected 'key=value'" % line)
        key, value = key.strip(), value.strip()
        if key not in res:
            res[key] = value
        elif isinstance(res[key], list):
            res[key].append(value)
        else:
            # second occurrence of the key: switch to a list of values
            res[key] = [res[key], value]
    return res
314
315
# regular expressions used to parse "<value><unit>" strings
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?' % (__VALUE_URE, __UNITS_URE))
# a sequence of <value><unit> pairs, optionally followed by a bare value
_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE,
                                                    __VALUE_URE))

# multiplier of each size unit, expressed in bytes
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# multiplier of each duration unit, expressed in seconds
TIME_UNITS = {
    "ms": 0.001,  # a millisecond is a thousandth of a second (was 0.0001)
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
339
342 """Parse the string applying the units defined in units
343 (e.g.: "1.5m",{'m',60} -> 80).
344
345 :type string: str or unicode
346 :param string: the string to parse
347
348 :type units: dict (or any object with __getitem__ using basestring key)
349 :param units: a dict mapping a unit string repr to its value
350
351 :type inter: type
352 :param inter: used to parse every intermediate value (need __sum__)
353
354 :type blank_reg: regexp
355 :param blank_reg: should match every blank char to ignore.
356
357 :type value_reg: regexp with "value" and optional "unit" group
358 :param value_reg: match a value and it's unit into the
359 """
360 if inter is None:
361 inter = final
362 fstring = _BLANK_RE.sub('', string)
363 if not (fstring and _VALIDATION_RE.match(fstring)):
364 raise ValueError("Invalid unit string: %r." % string)
365 values = []
366 for match in value_reg.finditer(fstring):
367 dic = match.groupdict()
368 lit, unit = dic["value"], dic.get("unit")
369 value = inter(lit)
370 if unit is not None:
371 try:
372 value *= units[unit.lower()]
373 except KeyError:
374 raise KeyError('invalid unit %s. valid units are %s' %
375 (unit, units.keys()))
376 values.append(value)
377 return final(sum(values))
378
379
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start = match.start()
    end = match.end()
    # NOTE: the match offsets are applied to the string *after* its line
    # endings have been replaced by `linesep`
    string = _LINE_RGX.sub(linesep, string)
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        # the match starts on the first line
        start_line_pos = 0
        result = []
    else:
        # keep everything located before the line holding the match
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        # the match ends on the last line
        result.append(string[start_line_pos:])
        result.append(underline)
    else:
        tail = string[end_line_pos + len(linesep):]
        result.append(string[start_line_pos:end_line_pos])
        result.append(underline)
        result.append(tail)
    return linesep.join(result).rstrip()
433
434
435
436
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}


def _get_ansi_code(color=None, style=None):
    """return ansi escape code corresponding to color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or the empty string when neither a color nor
      a style is given
    """
    ansi_code = []
    if style:
        ansi_code.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # numeric color: "38;5;<n>" selects an entry of the 256 colors
            # palette
            ansi_code.extend(['38', '5'])
            ansi_code.append(color)
        else:
            ansi_code.append(ANSI_COLORS[color])
    if ansi_code:
        return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
    return ''


def colorize_ansi(msg, color=None, style=None):
    """colorize message by wrapping it with ansi escape codes

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or `msg` unchanged when no escape code
      applies
    """
    # If both color and style are not defined, then leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # If invalid (or unknown) color, don't wrap msg with ansi codes
    if escape_code:
        return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg
522
# color name (see ANSI_COLORS) associated to each kind of diff element;
# presumably the default style of a diff colorizer defined further down
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}
524
539