- Timestamp: 06/05/08 16:20:04 (22 months ago)
- Files: 1 modified
Legend:
- Unmodified
- Added
- Removed
-
trunk/pylucid/PyLucid/plugins_internal/IncludeRemote/IncludeRemote.py
r1634 r1639 20 20 21 21 22 import socket, urllib2, re, time22 import socket, urllib2, cgi, re, time 23 23 24 24 socket.setdefaulttimeout(5) # set a timeout … … 29 29 from PyLucid.system.BasePlugin import PyLucidBasePlugin 30 30 31 STRIP_CONTENT = ( 32 # stripe stylesheet links 33 re.compile('(<link.*?rel.*?stylesheet.*?>)(?is)'), 34 # strip javascript 35 re.compile("(<script.*?</script>)(?is)") 36 ) 37 38 META_CHARSET = re.compile('<meta.*?charset=(.*?)"') 39 BODY_RE = re.compile("<body.*?>(.*?)</body>(?is)") 31 40 32 41 class IncludeRemote(PyLucidBasePlugin): 33 42 34 def lucidTag(self, url, title=None, escape=True): 35 43 def lucidTag(self, url, title=None, preformat=None, escape=True): 44 """ 45 docu about the method args, look into config file! 46 """ 36 47 # get the remote content 37 48 start_time = time.time() … … 40 51 content = f.read() 41 52 f.close() 42 except Exception, e :43 return (44 "<p>IncludeRemote error! Can't get '%s'<br />"45 " error:'%s'</p>"46 ) % (url, e) 53 except Exception, err: 54 if self.request.debug: 55 self.page_msg.red("Include remote '%s' error: %s" % (url, err)) 56 return "[error getting '%s'.]" % title 57 47 58 duration_time = time.time() - start_time 48 59 60 #______________________________________________________________________ 61 # GET HEADER INFO 49 62 50 # cutout stylesheets 51 try: 52 content = re.sub( 53 '(<link.*?rel.*?stylesheet.*?>)(?is)',"",content 54 ) 55 except: 56 pass 63 content_type = None 64 content_encodings = [] 65 66 # detect content type and encoding 67 raw_content_type = f.headers.get("content-type") 68 if raw_content_type: 69 content_type, params = cgi.parse_header(raw_content_type) 70 if "charset" in params: 71 content_encodings.append(params["charset"]) 57 72 58 73 59 # cutout JavaScripts74 # Try to get content charset from html meta info 60 75 try: 61 content = re.sub( 62 '(<script.*?</script>)(?is)',"",content 63 ) 64 except: 76 charset = META_CHARSET.findall(content.lower())[0] 77 except IndexError: 65 
78 pass 66 79 except Exception, err: 80 if self.request.debug: 81 self.page_msg.red("Error get content charset:", err) 82 else: 83 content_encodings.append(charset) 67 84 68 85 # decode into unicode 69 try: 70 charset = re.findall( 71 '<meta.*?content-type.*?charset=(.*?)"', content.lower() 72 )[0] 73 content = content.decode(charset) 74 except: 75 content = smart_unicode(content, errors='replace') 86 content = self._decode_content(content, content_encodings) 76 87 88 #______________________________________________________________________ 77 89 78 90 # try to cut out only the body content 79 91 try: 80 content = re.findall("<body.*?>(.*?)</body>(?is)",content)[0]81 except :92 content = BODY_RE.findall(content)[0] 93 except IndexError: 82 94 pass 95 except Exception, err: 96 if self.request.debug: 97 self.page_msg.red("Error strip body content:", err) 83 98 99 #______________________________________________________________________ 100 # Strip content 101 102 for regex in STRIP_CONTENT: 103 try: 104 content = regex.sub(u"",content) 105 except Exception, err: 106 if self.request.debug: 107 self.page_msg.red("Error strip content:", err) 108 109 #______________________________________________________________________ 84 110 85 111 if not escape: … … 87 113 content = mark_safe(content) 88 114 115 # setup preformat 116 if preformat==None and "html" in content_type: 117 preformat = False 118 else: 119 preformat = True 89 120 90 121 context = { … … 93 124 "title": title, 94 125 "content": content, 126 "preformat": preformat, 95 127 } 96 128 self._render_template("IncludeRemote", context)#, debug=True) 129 130 def _decode_content(self, content, content_encodings): 131 """ 132 Try to decode content into unicode with the given encoding list. 133 """ 134 if not content_encodings: 135 # No charset found. 
136 return smart_unicode(content, errors='replace') 137 138 errors = [] 139 for charset in content_encodings: 140 try: 141 content = content.decode(charset) 142 except UnicodeDecodeError, err: 143 errors.append(err) 144 else: 145 return content 146 147 if self.request.debug: 148 self.page_msg.red("Can't decode content:", errors) 149 150 return smart_unicode(content, errors='replace')