| 21 | | |
| 22 | | SUB_HTML_RE = re.compile(r"(?u)<(\w+)[^>]*>|(\w+)|</(\w+)>") |
| 23 | | |
| 24 | | class SubHtml(object): |
| 25 | | """ |
| 26 | | For replacing words in html code. But only if the word is not in a skip_tag. |
| 27 | | |
| 28 | | >>> class LexiconData(dict): |
| 29 | | ... def __call__(self, word_lower, word): |
| 30 | | ... return " *%s*\\n\\n" % word.upper() |
| 31 | | >>> lexicon_data = LexiconData({"foo": None, "bar": None}) |
| 32 | | >>> s = SubHtml(lexicon_data, skip_tags=["a"]) |
| 33 | | >>> s.process('<html><p><a href="Foo=Bar"><strong>Foo</strong> Bar</a>Foo Bar</p></html>') |
| 34 | | '<html><p><a href="Foo=Bar"><strong>Foo</strong> Bar</a>*FOO* *BAR*</p></html>' |
| 35 | | """ |
| 36 | | def __init__(self, lexicon_data, skip_tags=[]): |
| | 23 | import HTMLParser |
| | 24 | from xml.sax.saxutils import quoteattr |
| | 25 | |
| | 26 | |
if __name__ == "__main__":
    # Stand-alone run: activate the project's virtualenv and tell Django
    # which settings module to use before anything else is executed.
    virtualenv_file = "../../../../../bin/activate_this.py"
    os.environ['DJANGO_SETTINGS_MODULE'] = "pylucid_project.settings"
    # activate_this.py expects __file__ to point at itself.
    execfile(virtualenv_file, {"__file__": virtualenv_file})
| | 31 | |
| | 32 | |
| | 33 | |
class NoneHTMLParser(HTMLParser.HTMLParser, object):
    """
    Parse the html code with HTMLParser and rebuild it in self.html

    Usage: create an instance, call feed(html) and close(), then read the
    regenerated markup from the ``html`` attribute.

    FIXME: Changes from original html to regenerated:
        - entity references in attribute values are replaced with the real
          character, e.g.: href="&auml;" becomes the literal a-umlaut
        - spaces can be changed, e.g:
            old: <a href="foo" >bar</a>
            new: <a href="foo">bar</a>
            old: <br/>
            new: <br />
        - empty attributes are made xhtml conform, e.g:
            old: <td nowrap>
            new: <td nowrap="nowrap">
        - quote signs are added to attributes without quote signs, e.g:
            old: <table border=1>
            new: <table border="1">
        - newlines in tags are removed:
            old: <link\\nrel="...
            new: <link rel="...
        - attribute quoting can be changed:
            old: title="foo &quot;bar&quot;"
            new: title='foo "bar"'
    """
    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so its initializer is
        # called explicitly instead of via super(). Listing it before
        # ``object`` in the bases keeps its __init__ from being shadowed.
        HTMLParser.HTMLParser.__init__(self)

        # Newer parsers convert character references inside text on their
        # own; keep them untouched so handle_charref()/handle_entityref()
        # can echo them verbatim. The attribute is unused by older parsers.
        self.convert_charrefs = False

        self.html = ""  # the regenerated html code

    def _add_attrs(self, attrs):
        """Append the (name, value) pairs in attrs to self.html as tag attributes."""
        if not attrs:
            return

        attr_list = []
        for attr, value in attrs:
            if value is None:
                # convert empty attrs to xhtml conform attributes
                # e.g.: <td nowrap> ->> <td nowrap="nowrap">
                value = attr
            # FIXME: original quoting can be changed
            # e.g: title="foo &quot;bar&quot;" -> title='foo "bar"'
            attr_list.append('%s=%s' % (attr, quoteattr(value)))

        self.html += " " + " ".join(attr_list)

    def handle_startendtag(self, tag, attrs):
        """Re-emit an empty-element tag, e.g.: <br />"""
        self.html += "<" + tag
        self._add_attrs(attrs)
        self.html += " />"

    def handle_starttag(self, tag, attrs):
        """Re-emit a start tag together with its attributes."""
        self.html += "<" + tag
        self._add_attrs(attrs)
        self.html += ">"

    def handle_endtag(self, tag):
        """Re-emit an end tag."""
        self.html += "</%s>" % tag

    def handle_charref(self, name):
        """Re-emit a numeric character reference, e.g.: &#228;"""
        self.html += "&#%s;" % name

    def handle_entityref(self, name):
        """Re-emit a named entity reference, e.g.: &amp;"""
        self.html += "&%s;" % name

    def handle_data(self, data):
        """Re-emit text data unchanged."""
        self.html += data

    def handle_comment(self, data):
        """Re-emit a comment."""
        self.html += "<!--%s-->" % data

    def handle_decl(self, decl):
        """Re-emit a declaration, e.g.: <!DOCTYPE html>"""
        self.html += "<!%s>" % decl

    def handle_pi(self, data):
        """
        Re-emit a processing instruction instead of dropping it.

        The parser passes everything between "<?" and ">", so data already
        ends with "?" for XML style instructions like <?xml version="1.0"?>
        """
        self.html += "<?%s>" % data

    def unknown_decl(self, data):
        """Abort on declarations that cannot be rebuilt."""
        self.error("unknown declaration: %r" % (data,))
| | 119 | |
| | 120 | |
| | 121 | |
| | 122 | class SubHtml(NoneHTMLParser): |
| | 123 | """ |
| | 124 | replace all lexicon words in handle_data() |
| | 125 | """ |
| | 126 | def __init__(self, lexicon_data, skip_tags): |
| | 127 | super(SubHtml, self).__init__() |
| | 128 | |
| 40 | | self.in_skip_tag = None |
| 41 | | |
def sub(self, m):
    """
    Substitution callback for the html tokenizer regex (SubHtml method).

    The match object carries an opening tag name in group 1, a word from
    the text in group 2 or a closing tag name in group 3. Lexicon words are
    replaced with the stripped result of
    self.lexicon_data(word_lower, word); everything else, and everything
    inside a skip tag, is returned unchanged.
    """
    matched_text = m.group()

    if self.in_skip_tag:  # We are in a skip_tags
        close_tag = m.group(3)
        # Tag names are case-insensitive: <A> is closed by </a>, too.
        if close_tag and close_tag.lower() == self.in_skip_tag.lower():
            # The last skip_tag was closed
            self.in_skip_tag = None
        return matched_text

    tag = m.group(1)  # Open html tag
    if tag:
        # A self-closing tag (e.g.: <input />) has no end tag that could
        # ever close the skip area, so it must not open one.
        if tag.lower() in self.skip_tags and not matched_text.endswith("/>"):
            self.in_skip_tag = tag
        return matched_text

    word = m.group(2)  # A word from the text
    if word:
        word_lower = word.lower()
        if word_lower in self.lexicon_data:
            return self.lexicon_data(word_lower, word).strip()

    return matched_text
| 62 | | |
def process(self, html):
    """
    Return html with all lexicon words outside of skip tags replaced
    (SubHtml method); every token found by SUB_HTML_RE is passed to
    self.sub().
    """
    replace_token = self.sub
    return SUB_HTML_RE.sub(replace_token, html)
| | 133 | |
| | 134 | self.regex = self._build_regex() # Build the regexp to find all lexicon words |
| | 135 | |
| | 136 | self.in_skip_tag = None # Storage if we are in a skip_tags |
| | 137 | |
def _build_regex(self):
    """
    Build the regexp to find all lexicon words (SubHtml method).

    The pattern matches every key of self.lexicon_data, ignoring case, but
    only if the term is preceded by whitespace or ">" and followed by
    whitespace or one of "<", ".", ",", ":".

    Returns the compiled regular expression.
    """
    # Sort longest to shortest, so "foo bar" wins over "foo"
    terms = sorted(self.lexicon_data.keys(), key=len, reverse=True)

    if not terms:
        # An empty alternation would match the empty string between any two
        # suitable characters; use a pattern that can never match instead.
        return re.compile(r"(?!)")

    # The terms are plain text, not patterns: escape them, so that entries
    # like "c++" or "a.b" are matched literally.
    alternation = "|".join(re.escape(term) for term in terms)

    # match on all existing keys with ignorecase
    return re.compile(
        r"(?<=[\s>])(%s)(?=[\s<.,:])" % alternation,
        re.IGNORECASE | re.UNICODE | re.MULTILINE
    )
| | 150 | |
def handle_starttag(self, tag, attrs):
    """
    Re-emit the start tag via the base class and open a skip area if the
    tag is one of self.skip_tags (SubHtml method).
    """
    super(SubHtml, self).handle_starttag(tag, attrs)

    if self.in_skip_tag is not None:
        # Already inside a skip area: keep the outermost skip tag. A nested
        # skip tag must not take over, because its end tag would finish the
        # skip area while the outer tag is still open.
        return

    # Void elements never get an end tag, so a skip area opened by one of
    # them (e.g.: <input type="text">) would never be closed again.
    void_elements = (
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    )
    if tag in self.skip_tags and tag not in void_elements:
        self.in_skip_tag = tag
| | 156 | |
def handle_endtag(self, tag):
    """
    Re-emit the end tag via the base class and leave the skip area if this
    tag is the one that opened it (SubHtml method).
    """
    super(SubHtml, self).handle_endtag(tag)

    closes_skip_area = (self.in_skip_tag == tag)
    if closes_skip_area:
        self.in_skip_tag = None
| | 162 | |
def handle_data(self, data):
    """
    Append text data to self.html (SubHtml method).

    Outside of a skip area every lexicon term found by self.regex is
    replaced with the result of self.lexicon_data(matchobject). Empty or
    whitespace-only data and data inside a skip area is appended unchanged.
    """
    outside_skip_area = self.in_skip_tag is None
    if outside_skip_area and data and data.strip(" \n\t"):
        # work-a-round: the regex needs a character before and after a
        # term, so the data is padded with one space on each side, see:
        # http://www.python-forum.de/viewtopic.php?p=162915#162915
        padded = " " + data + " "
        replaced = self.regex.sub(self.lexicon_data, padded)
        data = replaced[1:-1]  # remove the padding again

    self.html += data
| 69 | | import doctest |
| 70 | | doctest.testmod(verbose=False) |
| 71 | | print "DocTest end." |
| | 177 | import urllib2, time |
| | 178 | from pylucid_project.utils.diff import diff_lines |
| | 179 | |
| | 180 | # import doctest |
| | 181 | # doctest.testmod(verbose=False) |
| | 182 | # print "DocTest end." |
| | 183 | |
| | 184 | skip_tags = ('a', 'input', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'textarea', 'fieldset') |
| | 185 | |
class LexiconData(dict):
    """
    Demo lexicon: maps lower case terms to a marker string. The instance is
    used as the replacement function for the lexicon regex.
    """
    def __call__(self, matchobject):
        """Return the matched text enclosed in the marker of its term."""
        matched = matchobject.group(0)
        print("LexiconData.__call__: %r" % matched)
        marker = self[matched.lower()]
        return marker + matched + marker
| | 192 | |
| | 193 | lexicon_data = LexiconData({"foo bar": "1", "foo": "2", "bar": "3"}) |
| | 194 | # lexicon_data = LexiconData({"match on nothing!": None}) |
| | 195 | |
| | 196 | # url = "http://www.pylucid.org" |
| | 197 | # url = "http://www.google.com" |
| | 198 | # url = "http://www.python.org" |
| | 199 | # url = "http://www.heise.de" |
| | 200 | # url = "http://www.facebook.com" |
| | 201 | # |
| | 202 | # print "request %r..." % url, |
| | 203 | # f = urllib2.urlopen(url) |
| | 204 | # html = f.read() |
| | 205 | # f.close() |
| | 206 | # print "OK" |
| | 207 | |
| | 208 | # html = '<a href="foo" title="here "is a problem" fuck">bar</a>' |
| | 209 | html = ''' |
| | 210 | <html><p><a href="Foo Bar"><strong>Foo Bar</strong> Bar</a> |
| | 211 | one Foo Bar two FOO three BaR four |
| | 212 | Here not: Fooo or XbarX |
| | 213 | </p></html> |
| | 214 | ''' |
| | 215 | |
| | 216 | start_time = time.time() |
| | 217 | s = SubHtml(lexicon_data, skip_tags=["a"]) |
| | 218 | s.feed(html) |
| | 219 | s.close() |
| | 220 | print "+++ duration: %.3fsec" % (time.time() - start_time) |
| | 221 | print diff_lines(html, s.html) |
| | 222 | |
| | 223 | # print "-" * 79 |
| | 224 | # print html |
| | 225 | # print "-" * 79 |
| | 226 | # print p.html |
| | 227 | # print "-" * 79 |
| | 228 | |
| | 229 | |