geRSSicht

annotate src/heisefeed.py @ 83:8ee7f00c0819

Fixed a stupid ATOM-feed issue that Telepolis introduced. Now one can parse the Telepolis feed again.
author Tobias Mueller (meatbox) <muelli@cryptobitch.de>
date Fri, 15 Apr 2011 15:54:36 +0200
parents 1c70c01d8522
children
rev   line source
muelli@6 1 #!/usr/bin/env python
muelli@6 2 # -*- coding: utf-8 -*-
muelli@6 3 """
muelli@19 4 A simple module which parses a website and writes an ATOM file.
muelli@6 5 """
muelli@6 6
muelli@6 7 import monkeypatch
muelli@6 8
muelli@6 9 import datetime
muelli@6 10 import htmlentitydefs
muelli@40 11 import logging
muelli@19 12 import os
muelli@79 13 import re
muelli@7 14 import urllib2
muelli@6 15 import xml.etree.cElementTree as ET
muelli@36 16 from xml.parsers.expat import ExpatError
muelli@6 17
muelli@30 18 from Lazy import Lazy
muelli@6 19 import xmlobject
muelli@6 20 from deps.E import E
muelli@6 21
muelli@6 22
def format_date(date):
    """Return *date* rendered by its own ``isoformat()`` method.

    The argument only needs to provide ``isoformat()``; within this module
    it is called with the ``time`` attribute of a Meldung.
    """
    iso_string = date.isoformat()
    return iso_string
muelli@6 25
class Logged:
    """Placeholder wrapper around a callable.

    The wrapped function is stored on the instance, but calling the wrapper
    currently does nothing and yields ``None``.
    """

    def __init__(self, func):
        # Remember the wrapped callable; nothing in this class invokes it yet.
        self.func = func

    def __call__(self):
        # Intentionally a no-op stub.
        return None
muelli@6 31
class Meldung(object):
    """One news item ("Meldung") together with cached access to its article.

    The plain attributes (``time``, ``heisenr``, ``link``, ``abstract``,
    ``title``) start out as ``None`` and are filled in by the parser.  The
    article HTML is downloaded from ``link`` on demand and cached on disk
    below ``TMPDIR``: once as raw lines and once as the extracted article
    snippet.
    """

    TMPDIR = '/tmp/heise-atom'
    # Both patterns take the integer heise number of the Meldung.
    TMPPATTERN_LINES = '%s/' % TMPDIR + 'heise-%d.html'
    TMPPATTERN_TEXT = '%s/' % TMPDIR + 'heise-%d-text.html'

    def __init__(self):
        self.time = None
        self.heisenr = None
        self.link = None
        self.abstract = None
        self.title = None
        self.log = logging.getLogger('HeiseMeldung')

    def __unicode__(self):
        return u"<Meldung %s from %s>" % (self.heisenr, self.time)

    def __str__(self):
        return "<Meldung %s from %s>" % (self.heisenr, self.time)

    def _ensure_tmpdir(self):
        """Create ``TMPDIR`` (mode 0700) unless it already exists."""
        import errno  # local import: only needed by this helper

        try:
            os.makedirs(self.TMPDIR, 0o700)
        except OSError as e:
            # An already existing directory is fine; anything else is not.
            if e.errno != errno.EEXIST:
                raise

    def _get_heise_text_lines_cached(self):
        """Return the article page as a list of unicode lines.

        The lines are read from the on-disk cache when possible; otherwise
        they are downloaded and written to the cache.
        """
        nr = self.heisenr
        cache_fname = self.TMPPATTERN_LINES % (nr, )
        try:
            with open(cache_fname, 'r') as cache:
                data = [line.decode('utf-8') for line in cache.readlines()]
            self.log.debug('Cache HIT! %d', nr)
        except IOError:
            data = self._get_heise_text_lines_noncached()
            self.log.debug('Cache MISS %d', nr)
            self._ensure_tmpdir()
            with open(cache_fname, 'w') as cache:
                # The downloaded lines still carry their own line endings,
                # so they are written back verbatim.  Joining them with
                # os.linesep would double every newline and make a cache
                # hit return different data than a cache miss.
                cache.writelines(line.encode('utf-8') for line in data)

        return data

    def _get_heise_text_lines_noncached(self):
        """Download ``self.link`` and return its content as unicode lines."""
        url = self.link
        self.log.debug('Trying to fetch and save %s as %d', url, self.heisenr)
        response = urllib2.urlopen(url)
        try:
            lines = [line.decode('utf-8') for line in response.readlines()]
        finally:
            response.close()

        return lines

    def _get_heise_text_cached(self):
        """Return the extracted article text, using the text cache.

        A missing, unreadable or empty cache file counts as a cache miss;
        the text is then extracted again and written to the cache.
        """
        nr = self.heisenr
        cache_fname = self.TMPPATTERN_TEXT % (nr, )
        data = None
        try:
            with open(cache_fname, 'r') as cache:
                data = cache.read().decode('utf-8')
        except IOError:
            pass

        if data:
            self.log.debug('Text Cache HIT! %d', nr)
            return data

        data = self._get_heise_text_noncached()
        self.log.debug('Text Cache MISS %d', nr)
        self._ensure_tmpdir()
        # NOTE(review): the "No start/No end found" fallback strings returned
        # by _get_heise_text_noncached() are cached like real article text.
        with open(cache_fname, 'w') as cache:
            cache.write(data.encode('utf-8'))

        return data

    def _get_heise_text_noncached(self):
        """Cut the article snippet out of the page lines.

        The snippet starts at the single line beginning with
        ``<div id="artikel">`` and ends before the first following line that
        begins with ``</div>``.  If no unique start or no end is found, a
        diagnostic string containing the whole page is returned instead.
        """
        lines = self._get_heise_text_lines_cached()
        self.log.debug('Read %d lines', len(lines))

        starts = [idx for (idx, line) in enumerate(lines)
                  if line.strip().startswith('<div id="artikel">')]
        if len(starts) != 1:
            self.log.warn('No Start (%d) of Meldung found in %s',
                          len(starts), lines)
            return "No start found in \n\n %s" % "\n".join(lines)
        start = starts[0]

        # Offsets are relative to ``start`` because the search runs on the
        # tail of the list.
        end_offsets = [offset for (offset, line) in enumerate(lines[start:])
                       if line.strip().startswith('</div>')]
        if not end_offsets:
            self.log.warn('No end (%d) of Meldung found in %s',
                          len(end_offsets), lines)
            return "No end found in \n\n %s" % "\n".join(lines)
        end = start + end_offsets[0]

        text = unicode("\n".join(lines[start:end]))
        # Log the article number and absolute line positions (the previous
        # code logged a leaked comprehension variable and a relative end).
        self.log.debug('Found Text for %d between %d and %d: %s',
                       self.heisenr, start, end, text)
        return text

    def get_heise_text(self):
        """Return the article text (cached on disk when available)."""
        text = self._get_heise_text_cached()

        return text

    # NOTE(review): Lazy comes from the project-local ``Lazy`` module;
    # presumably it evaluates the method once on first access - confirm.
    @Lazy
    def text(self):
        return self.get_heise_text()
muelli@71 127
class HeiseParserXMLObject(object):
    """Turn the HTML of a heise mobile news overview page into Meldung objects.

    The article ``<div>`` elements are cut out of the page with a regular
    expression, wrapped into a small XML document, repaired where known
    unescaped ampersands occur, and then parsed with ``xmlobject``.
    """

    # URL scheme of the regular newsticker, kept for reference only; the
    # mobile scheme below is the one that has always been in effect.
    #BASE = "http://www.heise.de/newsticker/meldung/%s"
    #BASE = "http://www.heise.de/newsticker/meldung/%s?view=print"
    BASE = 'http://heise-online.mobi/%s'

    # One match per article: group 1 is the whole <div>, group 2 its number.
    ARTIKEL_REGEX = """(<div id="news_([0-9]+)".*?class="artikel">.*?</div>)"""

    # Phrases seen in the feed that contain a raw '&', which is not
    # well-formed XML.  Each one is rewritten with '&amp;' before parsing.
    BROKEN_AMPERSANDS = (
        ' & ',
        '1&1',
        'AT&T',
        'PG&E',
        'Command&Control-Servers',
        'Giesecke & Devrient',
        'Technology & Operations',
        'a&o',
        'S&P',
        'Look&Feel',  # a missing comma here used to fuse this with 'ES&S'
        'ES&S',
    )

    def __init__(self, *args, **kwargs):
        self.log = logging.getLogger('ParserXMLObject')
        self.meldungen = []

    @classmethod
    def _escape_known_ampersands(cls, text):
        """Return *text* with every BROKEN_AMPERSANDS phrase XML-escaped."""
        for chunk in cls.BROKEN_AMPERSANDS:
            text = text.replace(chunk, chunk.replace('&', '&amp;'))
        return text

    def _log_parse_error(self, error, xmlo_raw):
        """Log the document plus the line and word the parser choked on."""
        self.log.critical("Couldn't create XMLObject from\n%s", xmlo_raw)

        if not hasattr(error, 'lineno'):
            return
        offending_lineno = error.lineno
        xml_lines = xmlo_raw.splitlines()
        # Never let the diagnostics raise and mask the real parse error.
        if not 1 <= offending_lineno <= len(xml_lines):
            return
        offending_line = xml_lines[offending_lineno - 1]
        self.log.critical("Offending line: %d\n%s",
                          offending_lineno, offending_line)

        offending_char = error.offset
        first_space = offending_line.rfind(' ', 0, offending_char)
        last_space = offending_line.find(' ', offending_char)
        if first_space == -1:
            first_space = 0
        if last_space == -1:
            # No space after the error position: take the rest of the line
            # (a slice end of -1 would drop the last character).
            last_space = len(offending_line)

        offending_word = offending_line[first_space:last_space]
        self.log.critical("Offending word:\n%s", offending_word)

    def feed(self, xml):
        """Parse the page *xml* and append one Meldung per article found.

        Returns ``self.meldungen``, i.e. everything fed to this instance so
        far.  Re-raises ``ExpatError`` if the assembled document cannot be
        parsed and ``AttributeError`` if it contains no ``div`` element.
        """
        heisestring = unicode(xml).replace('\n', '').replace('\t', '')

        artikel_xml = "".join(
            "%s\n" % artikel
            for artikel, nr in re.findall(self.ARTIKEL_REGEX, heisestring))

        heise_xml = '<?xml version="1.0" encoding="utf-8"?>'
        xmlo_raw = u"%s\n<news>%s</news>" % (heise_xml, artikel_xml)

        ## Fix XHTML Errors
        xmlo_raw = self._escape_known_ampersands(xmlo_raw)

        self.log.debug('trying to create XML Object from\n%s\n%s',
                       str(type(xmlo_raw)), xmlo_raw)
        try:
            xmlo = xmlobject.XMLFile(raw=xmlo_raw.encode('utf-8'))
        except ExpatError as e:
            self._log_parse_error(e, xmlo_raw)
            raise  # bare raise keeps the original traceback

        try:
            meldungen = xmlo.root.div
        except AttributeError:
            self.log.critical("Error getting a meldung from\n%s", xmlo_raw)
            raise

        for meldung in meldungen:
            m = Meldung()
            # str.strip() removes a *set of characters*, not a prefix; that
            # is safe here because ARTIKEL_REGEX only admits ids of the
            # form "news_<digits>".
            heisenr = int(meldung.id.strip('news_'))
            m.heisenr = heisenr

            datum_raw = meldung.span._text
            datum = datetime.datetime.strptime(datum_raw,
                                               '%d.%m.%Y - %H:%M Uhr')
            m.time = datum

            title_raw = meldung.h3.a._text
            m.title = title_raw

            abstract = meldung.p._text
            m.abstract = abstract

            # The href is appended to BASE as-is.  The heise numbers in the
            # URLs of the regular newsticker are one less than those of the
            # mobile version, so the mobile links are used throughout.
            suffix = meldung.h3.a.href
            m.link = self.BASE % suffix

            self.log.info('appending meldung: %s', m)
            self.meldungen.append(m)

        return self.meldungen


# Public name of the parser implementation used by HeiseFeedParser.
HeiseParser = HeiseParserXMLObject
muelli@19 231
class HeiseFeedParser(object):
    """Fetch heise mobile overview pages and render them as an ATOM feed.

    Parsed Meldung objects accumulate in ``self.meldungen`` across calls to
    ``parse``; ``to_atom`` serialises whatever has been collected so far.
    """

    def __init__(self):
        self.log = logging.getLogger('FeedParser')
        self.meldungen = []

    def fetch(self, url="http://heise-online.mobi/?seite=%d", index=0):
        """Download page *index* of *url* and return it decoded from UTF-8.

        *url* must contain a ``%d`` placeholder for the page index.
        """
        url = url % index
        response = urllib2.urlopen(url)
        try:
            buf = response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
        self.log.debug('Fetched %s', buf)
        return buf

    def fetch_and_parse(self):
        """Fetch the first overview page and return its parsed Meldungen."""
        buf = self.fetch()
        return self.parse(buf)

    def fetch_many_and_parse(self, count=5):
        """Fetch and parse pages ``0`` through *count* (inclusive).

        Note that this covers ``count + 1`` pages.  Returns the Meldungen of
        all fetched pages as one list.
        """
        ret = []
        for i in xrange(0, count + 1):
            buf = self.fetch(index=i)
            ret += self.parse(buf)
        return ret

    def parse(self, html):
        """Parses a html string and returns a list of parsed cases
        """
        self.log.debug(u'Calling parse() with locals: %s', locals())

        p = HeiseParser()
        p.feed(html)
        self.meldungen.extend(p.meldungen)
        return p.meldungen

    def get_atom_entries(self):
        """Return one ATOM ``entry`` element per collected Meldung."""
        entries = []
        total = len(self.meldungen)
        for (nr, meldung) in enumerate(self.meldungen):
            self.log.debug('Trying to append meldung %s (%d/%d)',
                           meldung, nr + 1, total)
            entries.append(E.entry(
                E.link(
                    href=meldung.link,
                    title=meldung.title,
                ),
                # Published and updated share the single known timestamp.
                E.published(format_date(meldung.time)),
                E.updated(format_date(meldung.time)),
                E.id("%d" % meldung.heisenr),
                E.title(meldung.title),
                E.summary(meldung.abstract),
            ))
        return entries

    def get_latest_update(self):
        """Return the most recent ``time`` among the collected Meldungen.

        Raises ``ValueError`` if nothing has been parsed yet.
        """
        if not self.meldungen:
            raise ValueError("no Meldungen available to determine the "
                             "latest update from")
        return max(meldung.time for meldung in self.meldungen)

    def to_atom(self):
        """Serialise the collected Meldungen as an ATOM document string.

        Raises ``ValueError`` (from ``get_latest_update``) when there are no
        Meldungen.
        """
        atom_feed = E.feed(
            E.author(
                E.name("heise online"),
            ),
            E.title("heise online News"),
            E.subtitle("Nachrichten nicht nur aus der Welt der Computer"),
            E.icon("http://heise-online.mobi//icons/logo.gif"),
            E.id("http://www.heise.de/newsticker/"),
            E.updated(format_date(self.get_latest_update())),
            E.generator("Muellis ETree ATOM Generator"),

            xmlns="http://www.w3.org/2005/Atom",
            *self.get_atom_entries()
        )
        xmlstr = ET.tostring(atom_feed)
        xmlheader = u'<?xml version="1.0" encoding="utf-8" ?>\n'
        return xmlheader + xmlstr

    def to_atom_file(self, fname):
        """Write the ATOM document, UTF-8 encoded, to the file *fname*."""
        # Build the document first so a failure leaves no truncated file.
        payload = self.to_atom().encode('utf-8')
        with open(fname, "w") as f:
            f.write(payload)
muelli@10 318
if __name__ == "__main__":
    # Command line entry point: fetch the overview pages and write them as an
    # ATOM file to the path given as first argument (default: heise-atom.xml).
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] filename")
    parser.add_option("-l", "--loglevel", dest="loglevel",
                      help="Sets the loglevel to one of debug, info, warn,"
                           " error, critical", default="error")
    (options, args) = parser.parse_args()

    loglevels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warn': logging.WARN,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }
    # Unknown names fall back to the ERROR *constant*; the previous fallback
    # was the string "error", which is not a valid logging level.
    loglevel = loglevels.get(options.loglevel.lower(), logging.ERROR)
    logging.basicConfig(level=loglevel)
    log = logging.getLogger("HeiseClients Main")

    path = args[0] if args else "heise-atom.xml"

    HFP = HeiseFeedParser()
    items = HFP.fetch_many_and_parse()
    # Explicit check instead of ``assert``, which is stripped under -O.
    if not items:
        raise SystemExit("No Meldungen could be parsed; not writing %s" % path)
    log.info('Writing %d Meldungen to %s', len(items), path)
    HFP.to_atom_file(path)