| 1 | # -*- coding: iso-8859-1 -*- |
|---|
| 2 | """ |
|---|
| 3 | Creole wiki markup parser |
|---|
| 4 | |
|---|
| 5 | See http://wikicreole.org/ for latest specs. |
|---|
| 6 | |
|---|
| 7 | Notes: |
|---|
| 8 | * No markup allowed in headings. |
|---|
| 9 | Creole 1.0 does not require us to support this. |
|---|
| 10 | * No markup allowed in table headings. |
|---|
| 11 | Creole 1.0 does not require us to support this. |
|---|
| 12 | * No (non-bracketed) generic url recognition: this is "mission impossible" |
|---|
| 13 | except if you want to risk lots of false positives. Only known protocols |
|---|
| 14 | are recognized. |
|---|
| 15 | * We do not allow ":" before "//" italic markup to avoid urls with |
|---|
| 16 | unrecognized schemes (like wtf://server/path) triggering italic rendering |
|---|
| 17 | for the rest of the paragraph. |
|---|
| 18 | |
|---|
| 19 | PyLucid Updates by the PyLucid team: |
|---|
| 20 | - Bugfixes and better html code style |
|---|
| 21 | - Make the image tag match more strict, so it doesn't clash with |
|---|
| 22 | django template tags |
|---|
| 23 | - Add a passthrough for all django template blocktags |
|---|
| 24 | - Add a passthrough for html code lines |
|---|
| 25 | |
|---|
| 26 | @copyright: 2007 MoinMoin:RadomirDopieralski (creole 0.5 implementation), |
|---|
| 27 | 2007 MoinMoin:ThomasWaldmann (updates) |
|---|
| 28 | 2008 PyLucid:JensDiemer (PyLucid patches) |
|---|
| 29 | @license: GNU GPL, see COPYING for details. |
|---|
| 30 | """ |
|---|
| 31 | |
|---|
| 32 | import re |
|---|
| 33 | |
|---|
| 34 | |
|---|
class Rules:
    """
    Hold all the rules for generating regular expressions.

    Every attribute is a raw pattern string, not a compiled regex. Parser
    joins several of them with "|" and compiles the result with re.X
    (verbose mode), which is why the patterns contain free whitespace.

    Parser._replace() looks up a handler method called _<group>_repl for the
    named groups of a match, so renaming a (?P<name>...) group here requires
    renaming the matching handler in Parser.
    """

    # For the inline elements:

    # URL schemes that are recognized for bare (non-bracketed) urls.
    proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
    # A bare url: at line start or after whitespace/punctuation, optionally
    # prefixed with the "~" escape character, ending before whitespace or
    # before punctuation that is itself followed by whitespace/line end.
    url = r'''(?P<url>
            (^ | (?<=\s | [.,:;!?()/=]))
            (?P<escaped_url>~)?
            (?P<url_target> (?P<url_proto> %s ):\S+? )
            ($ | (?=\s | [,.:;!?()] (\s | $)))
        )''' % proto
    # [[target]] or [[target|text]]
    link = r'''(?P<link>
            \[\[
            (?P<link_target>.+?) \s*
            ([|] \s* (?P<link_text>.+?) \s*)?
            ]]
        )'''

#    link = r'''(?P<link1>
#            \[\[
#            (?P<link_target1>.+?)\|(?P<link_text1>.+?)
#            ]]
#        )|(?P<link2>
#            \[\[
#            (?P<link_target2> (%s)://[^ ]+) \s* (?P<link_text2>.+?)
#            ]]
#        )|
#            \[\[(?P<internal_link>.+)\]\]
#        ''' % proto

    #--------------------------------------------------------------------------
    # The image rule should not match on django template tags! So we make it
    # more restricted.
    # It matches only if...
    #   ...image target ends with a picture extension
    #   ...separator >|< and the image text exist
    #
    # NOTE(review): the "| text" part is wrapped in an optional group "(...)?",
    # so as written the separator and text are not actually required.
    # NOTE(review): the trailing "(?i)" is a global inline flag. Per the `re`
    # documentation such a flag applies to the entire expression, so every
    # regex that is joined with this rule presumably becomes case-insensitive
    # as a whole; newer Python versions only accept global flags at the start
    # of the expression. Confirm before porting.
    image = r'''(?P<image>
            {{
            (?P<image_target>.+?(\.jpg|\.jpeg|\.gif|\.png)) \s*
            (\| \s* (?P<image_text>.+?) \s*)?
            }}
        )(?i)'''
    #--------------------------------------------------------------------------

    # <<name args>> ...text... <</name>>  (the end tag must repeat the name)
    macro_block = r'''(?P<macro_block>
            \s* << (?P<macro_block_start>\w+) \s* (?P<macro_block_args>.*?) >>
            (?P<macro_block_text>(.|\n)+?)
            <</(?P=macro_block_start)>> \s*
        )'''

    # <<name args>> without a body
    macro = r'''(?P<macro>
            <<
            (?P<macro_name> \w+) (?P<macro_args>.*?)
            >>
        )'''
    # Inline {{{ code }}}
    code = r'(?P<code> {{{ (?P<code_text>.*?) }}} )'
    emph = r'(?P<emph> (?<!:)// )' # there must be no : in front of the //
                                   # avoids italic rendering in urls with
                                   # unknown protocols
    strong = r'(?P<strong> \*\* )'
    # Two literal backslashes force a line break.
    linebreak = r'(?P<linebreak> \\\\ )'
    # "~" escapes the single non-whitespace character that follows it.
    escape = r'(?P<escape> ~ (?P<escaped_char>\S) )'
    # Fallback: any single character that no other inline rule consumed.
    char = r'(?P<char> . )'

    # For the block elements:
    separator = r'(?P<separator> ^ \s* ---- \s* $ )' # horizontal line
    line = r'''(?P<line> ^\s*$ )''' # empty line that separates paragraphs
    # Headline: the number of leading "=" is captured in head_head; trailing
    # "=" characters are optional and not part of head_text.
    head = r'''(?P<head>
            ^
            (?P<head_head>=+) \s*
            (?P<head_text> .*? )
            =*$
        )'''
    # A line of text; "break" matches the line end when the line does not end
    # with a backslash and the following line is not blank.
    text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
    list = r'''(?P<list>
            ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $
            ( \n[ \t]* [*\#]+.* $ )*
        )''' # Matches the whole list, separate items are parsed later. The
             # list *must* start with a single bullet.
    item = r'''^ \s* (?P<item>
            (?P<item_head> [\#*]+) \s*
            (?P<item_text> .*?)
        ) \s* $''' # Matches single list items
    # Preformatted block between "{{{" and "}}}" lines; an optional first
    # line "#!kind ..." names the kind of content (captured as pre_kind).
    pre = r'''(?P<pre>
            ^{{{ \s* $
            (\n)?
            (?P<pre_text>
                ([\#]!(?P<pre_kind>\w*?)(\s+.*)?$)?
                (.|\n)+?
            )
            (\n)?
            ^}}} \s*$
        )'''
    # A "~}}}" line inside a pre block: the tilde escapes the closing marker.
    pre_escape = r' ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $'

    # Pass-through all django template blocktags
    pass_block = r'''(?P<pass_block>
            {% \s* (?P<pass_block_start>.+?) \s* (?P<pass_block_args>.*?) \s* %}
            (\n|.)*?
            {% \s* end(?P=pass_block_start) \s* %}
        )'''

    # A django tag or variable after a newline.
    # NOTE(review): "|" binds looser than concatenation, so this reads as
    # "(\n|\s)* ({%...%})" OR "({{...}}) (\n|\s)*" -- the leading whitespace
    # belongs only to the first alternative and the trailing whitespace only
    # to the second. Confirm that this is intended.
    pass_line = r'''\n(?P<pass_line>
            (\n|\s)*
            ({%.*?%})|
            ({{.*?}})
            (\n|\s)*
        )'''
    # A django tag or variable anywhere inside a line.
    pass_inline = r'''(?P<pass_inline>
            ({%.*?%})|
            ({{.*?}})
        )'''

    #Pass-through html code lines
    html = r'''(?P<html>
            ^[ \t]*<[a-zA-Z].*?<(/[a-zA-Z ]+?)>[ \t]*$
        )'''

    # One table row: a line that starts with "|".
    table = r'''^ \s*(?P<table>
            [|].*? \s*
            [|]?
        ) \s* $'''

    # For splitting table cells:
    # "head" is a cell whose content starts with "="; "cell" is a normal cell
    # and may contain links, macros, images and inline code (which can hold
    # "|" characters themselves).
    cell = r'''
            \| \s*
            (
                (?P<head> [=][^|]+ ) |
                (?P<cell> (  %s | [^|])+ )
            ) \s*
        ''' % '|'.join([link, macro, image, code])
|---|
| 166 | |
|---|
| 167 | #-------------------------------------------------------------------------- |
|---|
| 168 | # blockelements = ( |
|---|
| 169 | # "head", "list", "pre", "code", "table", "separator", "macro", |
|---|
| 170 | # "pass_block", "pass_line", "html" |
|---|
| 171 | # ) |
|---|
| 172 | |
|---|
class Parser:
    """
    Parse the raw text and create a document object
    that can be converted into output using Emitter.

    Usage: ``Parser(raw).parse()`` returns the root DocNode of kind
    'document'.

    Parsing is driven by regex substitution callbacks: every match is routed
    through _replace(), which picks a ``_<group>_repl`` handler from the name
    of a named group that took part in the match. Any participating group may
    be picked, so every named group of a rule needs a handler (alias).
    """
    # For pre escaping, in creole 1.0 done with ~:
    pre_escape_re = re.compile(Rules.pre_escape, re.M | re.X)
    # for link descriptions:
    link_re = re.compile(
        '|'.join([Rules.image, Rules.linebreak, Rules.char]),
        re.X | re.U
    )
    item_re = re.compile(Rules.item, re.X | re.U | re.M)  # for list items
    cell_re = re.compile(Rules.cell, re.X | re.U)  # for table cells
    # For block elements (order matters: earlier alternatives win):
    block_re = re.compile(
        '|'.join([
            Rules.pass_block,
            Rules.pass_line,
            Rules.macro_block,
            Rules.html,
            Rules.line, Rules.head, Rules.separator, Rules.pre, Rules.list,
            Rules.table, Rules.text,
        ]),
        re.X | re.U | re.M
    )
    # For inline elements (order matters: earlier alternatives win):
    inline_re = re.compile(
        '|'.join([
            Rules.link, Rules.url, Rules.macro,
            Rules.code, Rules.image,
            Rules.pass_inline,
            Rules.strong, Rules.emph, Rules.linebreak,
            Rules.escape, Rules.char
        ]),
        re.X | re.U
    )

    def __init__(self, raw):
        """Store the markup *raw* and create an empty document tree."""
        self.raw = raw
        self.root = DocNode('document', None)
        self.cur = self.root        # The most recent document node
        self.text = None            # The node to add inline characters to
        self.last_text_break = None # Last break node, inserted by _text_repl()

    #--------------------------------------------------------------------------

    def cleanup_break(self, old_cur):
        """
        remove unused end line breaks.
        Should be called before a new block element.
        e.g.:
          <p>line one<br />
          line two<br />     <--- remove this br-tag
          </p>

        The trailing 'break' child is removed from *old_cur*. If *old_cur*
        is None the current node (self.cur) is used instead.
        """
        # The argument used to be ignored in favour of self.cur; every caller
        # in this class passes self.cur, so honouring it changes nothing here.
        node = self.cur if old_cur is None else old_cur
        if node.children and node.children[-1].kind == "break":
            del node.children[-1]

    def _upto(self, node, kinds):
        """
        Look up the tree to the first occurence
        of one of the listed kinds of nodes or root.
        Start at the node node.

        Returns the node found; self.cur is not modified.
        """
        self.cleanup_break(node) # remove unused end line breaks.
        while node.parent is not None and node.kind not in kinds:
            node = node.parent

        return node

    def _upto_block(self):
        """Move self.cur up to the enclosing 'document' node."""
        self.cur = self._upto(self.cur, ('document',))# 'section', 'blockquote'))

    #__________________________________________________________________________
    # The _*_repl methods called for matches in regexps. Sometimes the
    # same method needs several names, because of group names in regexps.

    def _pass_block_repl(self, groups):
        """ Pass-through all django template blocktags """
        self._upto_block()
        self.cur = self.root
        DocNode("pass_block", self.cur, groups["pass_block"])
        self.text = None
    _pass_block_start_repl = _pass_block_repl
    # "pass_block_args" is a named group of Rules.pass_block and was missing
    # a handler alias.
    _pass_block_args_repl = _pass_block_repl
    _pass_block_end_repl = _pass_block_repl

    def _pass_line_repl(self, groups):
        """ Pass-through all django tags witch is alone in a code line """
        self._upto_block()
        self.cur = self.root
        DocNode("pass_line", self.cur, groups["pass_line"])
        self.text = None

    def _pass_inline_repl(self, groups):
        """ Pass-through all inline django tags"""
        DocNode("pass_inline", self.cur, groups["pass_inline"])
        self.text = None

    def _html_repl(self, groups):
        """ Pass-through html code """
        self._upto_block()
        DocNode("html", self.root, groups["html"])
        self.text = None

    def _text_repl(self, groups):
        """
        Handle a line of text: leave a table/list, open a paragraph when at
        block level, parse the inline markup and append a 'break' node if the
        line break group matched.
        """
        if self.cur.kind in ('table', 'table_row', 'bullet_list',
                                                                'number_list'):
            self._upto_block()

        if self.cur.kind in ('document', 'section', 'blockquote'):
            self.cur = DocNode('paragraph', self.cur)

        self.parse_inline(groups.get('text', u""))

        if groups.get('break') and self.cur.kind in ('paragraph',
            'emphasis', 'strong', 'code'):
            self.last_text_break = DocNode('break', self.cur, u"")

        self.text = None
    _break_repl = _text_repl

    def _url_repl(self, groups):
        """Handle raw urls in text."""
        if not groups.get('escaped_url'):
            # this url is NOT escaped
            target = groups.get('url_target', u"")
            node = DocNode('link', self.cur)
            node.content = target
            DocNode('text', node, node.content)
            self.text = None
        else:
            # this url is escaped, we render it as text
            if self.text is None:
                self.text = DocNode('text', self.cur, u"")
            self.text.content += groups.get('url_target')
    _url_target_repl = _url_repl
    _url_proto_repl = _url_repl
    # _replace() looks up "_<group>_repl"; the alias below was previously
    # only available under the name "_escaped_url", which _replace() can
    # never find. The old name is kept for backward compatibility.
    _escaped_url_repl = _url_repl
    _escaped_url = _url_repl

    def _link_repl(self, groups):
        """Handle all kinds of links."""
        target = groups.get('link_target', u"")
        text = (groups.get('link_text', u"") or u"").strip()
        parent = self.cur
        self.cur = DocNode('link', self.cur)
        self.cur.content = target
        self.text = None
        # Only images, forced line breaks and plain characters are
        # recognized inside the link description.
        self.link_re.sub(self._replace, text)
        self.cur = parent
        self.text = None
    _link_target_repl = _link_repl
    _link_text_repl = _link_repl

    def _add_macro(self, macro_name, macro_args, macro_text=u""):
        """
        Append a 'macro' node to the current node. The stripped macro text
        becomes the node content; name and stripped args are stored in the
        node attributes macro_name / macro_args.
        """
#        self._upto_block()
        node = DocNode("macro", self.cur, macro_text.strip())
        node.macro_name = macro_name
        node.macro_args = macro_args.strip()
        self.text = None

    def _macro_block_repl(self, groups):
        """Handles macros using the placeholder syntax."""
        #self.debug_groups(groups)
        self._upto_block()
        self.cur = self.root
        self._add_macro(
            macro_name = groups['macro_block_start'],
            macro_text = groups.get('macro_block_text', u""),
            macro_args = groups.get('macro_block_args', u""),
        )
        self.text = None
    _macro_block_start_repl = _macro_block_repl
    _macro_block_args_repl = _macro_block_repl
    _macro_block_text_repl = _macro_block_repl

    def _macro_repl(self, groups):
        """Handles macros using the placeholder syntax."""
        macro_name = groups.get('macro_name', u"")
        macro_args = groups.get('macro_args', u"")
        self._add_macro(macro_name, macro_args)
        self.text = None

#        text = (groups.get('macro_text', u"") or u"").strip()
#        node = DocNode('macro', self.cur, name)
#        node.args = groups.get('macro_args', u"") or ''
#        DocNode('text', node, text or name)
#        self.text = None
    _macro_name_repl = _macro_repl
    _macro_args_repl = _macro_repl
#    _macro_text_repl = _macro_repl

    def _image_repl(self, groups):
        """Handles images and attachemnts included in the page."""
        target = groups.get('image_target', u"").strip()
        text = (groups.get('image_text', u"") or u"").strip()
        node = DocNode("image", self.cur, target)
        # Without an explicit image text the target is used as the text.
        DocNode('text', node, text or node.content)
        self.text = None
    _image_target_repl = _image_repl
    _image_text_repl = _image_repl

    def _separator_repl(self, groups):
        """Handle a horizontal line."""
        self._upto_block()
        DocNode('separator', self.cur)

    def _item_repl(self, groups):
        """
        Handle a single list item: find or create the list of the matching
        kind and nesting level, append a 'list_item' and parse its text.
        """
        bullet = groups.get('item_head', u"")
        text = groups.get('item_text', u"")
        # The last bullet character decides the kind of list.
        if bullet[-1] == '#':
            kind = 'number_list'
        else:
            kind = 'bullet_list'
        level = len(bullet)-1
        lst = self.cur
        # Find a list of the same kind and level up the tree
        while (lst and
                   not (lst.kind in ('number_list', 'bullet_list') and
                        lst.level == level) and
                    lst.kind not in ('document', 'section', 'blockquote')):
            lst = lst.parent
        if lst and lst.kind == kind:
            self.cur = lst
        else:
            # Create a new level of list
            self.cur = self._upto(self.cur,
                ('list_item', 'document', 'section', 'blockquote'))
            self.cur = DocNode(kind, self.cur)
            self.cur.level = level
        self.cur = DocNode('list_item', self.cur)
        self.cur.level = level+1
        self.parse_inline(text)
        self.text = None
    _item_text_repl = _item_repl
    _item_head_repl = _item_repl

    def _list_repl(self, groups):
        """Handle a whole list by parsing its items one by one."""
        self.item_re.sub(self._replace, groups["list"])

    def _head_repl(self, groups):
        """Handle a headline; the level is the number of leading '='."""
        self._upto_block()
        node = DocNode('header', self.cur, groups['head_text'].strip())
        node.level = len(groups['head_head'])
        self.text = None
    _head_head_repl = _head_repl
    _head_text_repl = _head_repl

    def _table_repl(self, groups):
        """
        Handle one table row: reuse the enclosing table or create a new one,
        then add a 'table_row' with one 'table_cell'/'table_head' per cell.
        """
        row = groups.get('table', '|').strip()
        self.cur = self._upto(self.cur, (
            'table', 'document', 'section', 'blockquote'))
        if self.cur.kind != 'table':
            self.cur = DocNode('table', self.cur)
        tb = self.cur
        tr = DocNode('table_row', tb)

        for m in self.cell_re.finditer(row):
            cell = m.group('cell')
            if cell:
                text = cell.strip()
                self.cur = DocNode('table_cell', tr)
                self.text = None
            else:
                # Heading cell: drop the "=" marker and surrounding blanks.
                text = m.group('head').strip('= ')
                self.cur = DocNode('table_head', tr)
                self.text = DocNode('text', self.cur, u"")
            self.parse_inline(text)

        self.cur = tb
        self.text = None

    def _pre_repl(self, groups):
        """
        Handle a preformatted block. Escaped closing markers ("~}}}" lines)
        are unescaped; the optional kind is stored in the node attribute
        'sect'.
        """
        self._upto_block()
        kind = groups.get('pre_kind', None)
        text = groups.get('pre_text', u"")
        def remove_tilde(m):
            return m.group('indent') + m.group('rest')
        text = self.pre_escape_re.sub(remove_tilde, text)
        node = DocNode('preformatted', self.cur, text)
        node.sect = kind or ''
        self.text = None
    _pre_text_repl = _pre_repl
    _pre_head_repl = _pre_repl
    _pre_kind_repl = _pre_repl

    def _line_repl(self, groups):
        """ Transfer newline from the original markup into the html code """
        self._upto_block()
        DocNode('line', self.cur, u"")

    def _code_repl(self, groups):
        """Handle inline code."""
        DocNode('code', self.cur, groups.get('code_text', u"").strip())
        self.text = None
    _code_text_repl = _code_repl
    _code_head_repl = _code_repl

    def _emph_repl(self, groups):
        """Toggle emphasis: open an 'emphasis' node or close the current one."""
        if self.cur.kind != 'emphasis':
            self.cur = DocNode('emphasis', self.cur)
        else:
            self.cur = self._upto(self.cur, ('emphasis', )).parent
        self.text = None

    def _strong_repl(self, groups):
        """Toggle strong: open a 'strong' node or close the current one."""
        if self.cur.kind != 'strong':
            self.cur = DocNode('strong', self.cur)
        else:
            self.cur = self._upto(self.cur, ('strong', )).parent
        self.text = None

    def _linebreak_repl(self, groups):
        """Handle a forced line break."""
        DocNode('break', self.cur, None)
        self.text = None

    def _escape_repl(self, groups):
        """Append an escaped character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u"")
        self.text.content += groups.get('escaped_char', u"")
    # "escaped_char" is a named group of Rules.escape and was missing a
    # handler alias.
    _escaped_char_repl = _escape_repl

    def _char_repl(self, groups):
        """Append a plain character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u"")
        self.text.content += groups.get('char', u"")

    #--------------------------------------------------------------------------

    def _replace(self, match):
        """Invoke appropriate _*_repl method. Called for every matched group."""
        groups = match.groupdict()
        # The first group with a value selects the handler. The iteration
        # order of the dict is not relied upon: all named groups of one rule
        # are aliased to the same handler.
        for name, text in groups.items():
            if text is not None:
                #if name != "char": print "%15s: %r" % (name, text)
                #print "%15s: %r" % (name, text)
                replace = getattr(self, '_%s_repl' % name)
                replace(groups)
                return

    def parse_inline(self, raw):
        """Recognize inline elements inside blocks."""
        # The substitution result is discarded; only the callbacks matter.
        self.inline_re.sub(self._replace, raw)

    def parse_block(self, raw):
        """Recognize block elements."""
        # The substitution result is discarded; only the callbacks matter.
        self.block_re.sub(self._replace, raw)

    def parse(self):
        """Parse the text given as self.raw and return DOM tree."""
        # convert all lineendings to \n
        text = self.raw.replace("\r\n", "\n").replace("\r", "\n")
        self.parse_block(text)
        return self.root


    #--------------------------------------------------------------------------
    def debug(self, start_node=None):
        """
        Display the current document tree

        start_node -- node whose subtree is printed (default: the root).
        """
        # print(...) with one parenthesised argument behaves the same as the
        # Python 2 print statement.
        print("_"*80)

        if start_node is None:
            start_node = self.root
            print("  document tree:")
        else:
            print("  tree from %s:" % start_node)

        print("="*80)
        def emit(node, ident=0):
            for child in node.children:
                print(u"%s%s: %r" % (u" "*ident, child.kind, child.content))
                emit(child, ident+4)
        emit(start_node)
        print("*"*80)

    def debug_groups(self, groups):
        """Print every group of a match groupdict that has a value."""
        print("_"*80)
        print("  debug groups:")
        for name, text in groups.items():
            if text is not None:
                print("%15s: %r" % (name, text))
        print("-"*80)
|---|
| 557 | |
|---|
| 558 | |
|---|
| 559 | |
|---|
| 560 | #------------------------------------------------------------------------------ |
|---|
| 561 | |
|---|
| 562 | |
|---|
class DocNode:
    """
    A node in the document.

    kind     -- name of the node type (e.g. 'document', 'text')
    parent   -- parent DocNode or None; a new node appends itself to the
                children list of its parent
    content  -- node content; converted to text unless it is None

    Attributes: children (list of child nodes), parent, kind, content.
    """
    # Text type used to normalise content: the ``unicode`` builtin where it
    # exists (Python 2), otherwise ``str``.
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    def __init__(self, kind='', parent=None, content=None):
        self.children = []
        self.parent = parent
        self.kind = kind

        # Only None means "no content". Falsy values such as 0 are converted
        # as well, so the content type does not depend on the value.
        if content is not None:
            content = self._text_type(content)
        self.content = content

        if self.parent is not None:
            self.parent.children.append(self)

    def __str__(self):
#        return "DocNode kind '%s', content: %r" % (self.kind, self.content)
        return "<DocNode %s: %r>" % (self.kind, self.content)

    # repr() and str() show the same text.
    __repr__ = __str__

    def debug(self):
        """Print the string form and all public attributes of the node."""
        # print(...) with one parenthesised argument behaves the same as the
        # Python 2 print statement.
        print("_"*80)
        print("\tDocNode - debug:")
        print("str(): %s" % self)
        print("attributes:")
        for i in dir(self):
            if i.startswith("_"):
                continue
            print("%20s: %r" % (i, getattr(self, i, "---")))
|---|
| 594 | |
|---|
| 595 | |
|---|
| 596 | #------------------------------------------------------------------------------ |
|---|
| 597 | |
|---|
| 598 | |
|---|
if __name__=="__main__":
    # Ad-hoc demo: parse a sample markup text and print the resulting
    # document tree.
    #
    # NOTE: "txt" is assigned several times in a row. Each assignment
    # replaces the previous one, so only the LAST sample below is parsed;
    # the earlier samples are kept as alternative inputs.
    txt = r"""== a headline

Here is [[a internal]] link.
This is [[http://domain.tld|external links]].
A [[internal links|different]] link name.

Basics: **bold** or //italic//
or **//both//** or //**both**//
Force\\linebreak.

The current page name: >{{ PAGE.name }}< great?
A {% lucidTag page_update_list count=10 %} PyLucid plugin

{% sourcecode py %}
import sys

sys.stdout("Hello World!")
{% endsourcecode %}

A [[www.domain.tld|link]].
a {{/image.jpg|My Image}} image

no image: {{ foo|bar }}!
picture [[www.domain.tld | {{ foo.JPG | Foo }} ]] as a link

END

==== Headline 1

{% a tag 1 %}

==== Headline 2

{% a tag 2 %}

the end
"""

    # Sample 2: django tags/variables between headlines (overwritten below).
    txt = r"""
==== Headline 1

The current page name: >{{ PAGE.name }}< great?

{% a tag 1 %}

==== Headline 2

{% a tag 2 %}

some text

{% something arg1="foo" arg2="bar" arg2=3 %}
foobar
{% endsomething %}

the end
"""

    # Sample 3: inline tag and a block tag (overwritten below).
    txt = r"""A {% lucidTag page_update_list count=10 %} PyLucid plugin

{% sourcecode py %}
import sys

sys.stdout("Hello World!")
{% endsourcecode %}
A [[www.domain.tld|link]]."""

    # Sample 4: block tags with matching, mismatching and missing end tags.
    # This is the text that is actually parsed.
    txt = r"""
==== Headline 1

On {% a tag 1 %} line
line two

==== Headline 2

{% a tag 2 %}

A block:
{% block %}
<Foo:> {{ Bar }}
{% endblock %}
end block

{% block1 arg="jo" %}
eofjwqp
{% endblock1 %}

A block without the right end block:
{% block1 %}
111
{% endblock2 %}
BBB

A block without endblock:
{% block3 %}
222
{% block3 %}
CCC

the end"""
#    txt = r'''
#<<jojo>>
#owrej
#<<code>>
#some code
#<</code>>
#a macro:
#<<code ext=.css>>
#/* Stylesheet */
#form * {
#  vertical-align:middle;
#}
#<</code>>
#the end
#<<code>>
#<<code>>
#jup
#<</code>>
#'''


    print "-"*80
    p = Parser(txt)
    document = p.parse()
    # Dump the parsed tree to stdout.
    p.debug()

    def test_rules(rules, txt):
        # Debug helper: print every named group (except "char") that has a
        # value, for each match of the compiled regex "rules" in "txt".
        # It is not called anywhere; the calls below are commented out.
        def display_match(match):
            groups = match.groupdict()
            for name, text in groups.iteritems():
                if name != "char" and text != None:
                    print "%13s: %r" % (name, text)
        re.sub(rules, display_match, txt)

#    print "_"*80
#    print "plain block rules match:"
#    test_rules(Parser("").block_re, txt)
#
#    print "_"*80
#    print "plain inline rules match:"
#    test_rules(Parser("").inline_re, txt)

    print "---END---"
|---|