geRSSicht

changeset 83:8ee7f00c0819 heise-feed tip

Fixed a stupid ATOM-feed issue that Telepolis introduced. Now one can parse the Telepolis feed again.
author Tobias Mueller (meatbox) <muelli@cryptobitch.de>
date Fri, 15 Apr 2011 15:54:36 +0200
parents 824525444374
children
files src/heisefeed.py src/telepolisfeed.py
diffstat 2 files changed, 8 insertions(+), 1 deletions(-) [+]
line diff
     1.1 --- a/src/heisefeed.py	Sat Jan 02 16:44:02 2010 +0100
     1.2 +++ b/src/heisefeed.py	Fri Apr 15 15:54:36 2011 +0200
     1.3 @@ -237,6 +237,7 @@
     1.4      def fetch(self, url="http://heise-online.mobi/?seite=%d", index=0):
     1.5          url = url % index
     1.6          buf = urllib2.urlopen(url).read().decode('utf-8')
     1.7 +        self.log.debug('Fetched %s', buf)
     1.8          return buf
     1.9  
    1.10      def fetch_and_parse(self):
     2.1 --- a/src/telepolisfeed.py	Sat Jan 02 16:44:02 2010 +0100
     2.2 +++ b/src/telepolisfeed.py	Fri Apr 15 15:54:36 2011 +0200
     2.3 @@ -94,7 +94,12 @@
     2.4          return text
     2.5  
     2.6      def feed(self, xml):
     2.7 -        xmlo = xmlobject.XMLFile( raw = xml.encode('utf-8'))
     2.8 +        token = '</xml>\n'
     2.9 +        if xml.endswith(token): # Hotfix for a stupid XML issue caused by a wrongly formatted ATOM feed
    2.10 +            xml = xml[:-len(token)]
    2.11 +        raw = xml.encode('utf-8')
    2.12 +        self.log.debug('Trying to feed %s', xml)        
    2.13 +        xmlo = xmlobject.XMLFile( raw = raw)
    2.14          
    2.15          SUFFIX = "/1.html"
    2.16          LENGTH = 5
    2.17 @@ -128,6 +133,7 @@
    2.18  
    2.19      def to_atom(self):
    2.20          html = self.fetch()
    2.21 +        self.log.debug('fetched html: %s', html)
    2.22          p = TelepolisParser()
    2.23          atom = p.feed(html)
    2.24          return atom