geRSSicht
view src/heisefeed.py @ 83:8ee7f00c0819
Fixed a stupid ATOM-feed issue that Telepolis introduced
Now one can parse the Telepolis feed again
Now one can parse the Telepolis feed again
| author | Tobias Mueller (meatbox) <muelli@cryptobitch.de> |
|---|---|
| date | Fri Apr 15 15:54:36 2011 +0200 (9 months ago) |
| parents | 1c70c01d8522 |
| children |
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """
4 A simple module which parses a website and writes an ATOM file.
5 """
7 import monkeypatch
9 import datetime
10 import htmlentitydefs
11 import logging
12 import os
13 import re
14 import urllib2
15 import xml.etree.cElementTree as ET
16 from xml.parsers.expat import ExpatError
18 from Lazy import Lazy
19 import xmlobject
20 from deps.E import E
def format_date(date):
    """Return *date* rendered by its own ``isoformat()`` method.

    The result is used for the ``published`` and ``updated`` elements of
    the generated feed.
    """
    iso_string = date.isoformat()
    return iso_string
class Logged(object):
    """Callable wrapper that logs each call before forwarding it.

    Intended for use as a decorator: ``Logged(func)`` returns an object
    that behaves like ``func`` when called.  The wrapped callable stays
    available as ``self.func``.
    """

    def __init__(self, func):
        self.func = func
        self.log = logging.getLogger('Logged')

    def __call__(self, *args, **kwargs):
        """Log the call and return whatever the wrapped callable returns."""
        # The previous stub accepted no arguments and returned None without
        # ever invoking self.func, which silently disabled anything wrapped.
        self.log.debug('Calling %s with args=%r kwargs=%r',
                       getattr(self.func, '__name__', self.func),
                       args, kwargs)
        return self.func(*args, **kwargs)
class Meldung(object):
    """A single news item ("Meldung") with on-disk caching of its article text.

    The attributes ``time``, ``heisenr``, ``link``, ``abstract`` and
    ``title`` start out as ``None`` and are filled in by the parser.
    The article page itself is only downloaded when the text is requested;
    both the raw page and the extracted text are cached below ``TMPDIR``.
    """
    TMPDIR = '/tmp/heise-atom'
    # Cache file for the raw article page, keyed by article number.
    TMPPATTERN_LINES = '%s/' % TMPDIR + 'heise-%d.html'
    # Cache file for the extracted article text, keyed by article number.
    TMPPATTERN_TEXT = '%s/' % TMPDIR + 'heise-%d-text.html'

    def __init__(self):
        self.time = None
        self.heisenr = None
        self.link = None
        self.abstract = None
        self.title = None
        self.log = logging.getLogger('HeiseMeldung')

    def __unicode__(self):
        return u"<Meldung %s from %s>" % (self.heisenr, self.time)

    def __str__(self):
        return "<Meldung %s from %s>" % (self.heisenr, self.time)

    def _ensure_tmpdir(self):
        """Create the cache directory (mode 0700) unless it already exists."""
        import errno
        try:
            os.makedirs(self.TMPDIR, 0o700)
        except OSError as e:
            # An already existing directory is fine; anything else is not.
            if e.errno != errno.EEXIST:
                raise

    def _get_heise_text_lines_cached(self):
        """Return the article page as a list of unicode lines.

        The lines are read from the cache file if it can be opened;
        otherwise the page is downloaded and written to the cache.
        """
        nr = self.heisenr
        cache_fname = self.TMPPATTERN_LINES % (nr, )
        try:
            with open(cache_fname, 'r') as cache_file:
                data = [line.decode('utf-8') for line in cache_file]
        except IOError:
            data = self._get_heise_text_lines_noncached()
            self.log.debug('Cache MISS %d', nr)
            self._ensure_tmpdir()
            with open(cache_fname, 'w') as cache_file:
                # The downloaded lines still carry their line terminators,
                # so they are written back unchanged.  Joining them with
                # os.linesep (as before) doubled every newline in the cache.
                cache_file.writelines(line.encode('utf-8') for line in data)
        else:
            self.log.debug('Cache HIT! %d', nr)

        return data

    def _get_heise_text_lines_noncached(self):
        """Download ``self.link`` and return its lines decoded from UTF-8."""
        url = self.link
        self.log.debug('Trying to fetch and save %s as %d', url, self.heisenr)
        response = urllib2.urlopen(url)
        try:
            lines = [line.decode('utf-8') for line in response.readlines()]
        finally:
            response.close()

        return lines

    def _get_heise_text_cached(self):
        """Return the extracted article text, using the text cache if possible.

        A missing, unreadable or empty cache file counts as a cache miss:
        the text is extracted again and the cache file rewritten.
        """
        nr = self.heisenr
        cache_fname = self.TMPPATTERN_TEXT % (nr, )
        data = None
        try:
            with open(cache_fname, 'r') as cache_file:
                data = cache_file.read().decode('utf-8')
        except IOError:
            pass

        if data:
            self.log.debug('Text Cache HIT! %d', nr)
            return data

        data = self._get_heise_text_noncached()
        self.log.debug('Text Cache MISS %d', nr)
        self._ensure_tmpdir()
        with open(cache_fname, 'w') as cache_file:
            cache_file.write(data.encode('utf-8'))

        return data

    def _get_heise_text_noncached(self):
        """Extract the article markup from the (possibly cached) page lines.

        The text runs from the single line starting with
        ``<div id="artikel">`` up to, but excluding, the first following
        line that starts with ``</div>``.  If those markers cannot be
        found, a diagnostic string containing the whole page is returned
        instead.
        """
        lines = self._get_heise_text_lines_cached()
        self.log.debug('Read %d lines', len(lines))
        starts = [idx for (idx, line) in enumerate(lines)
                  if line.strip().startswith('<div id="artikel">')]
        if len(starts) != 1:
            self.log.warning('No Start (%d) of Meldung found in %s',
                             len(starts), lines)
            return "No start found in \n\n %s" % "\n".join(lines)
        start = starts[0]
        # Offsets are relative to ``start`` because the search runs on a slice.
        ends = [offset for (offset, line) in enumerate(lines[start:])
                if line.strip().startswith('</div>')]
        if not ends:
            self.log.warning('No end (%d) of Meldung found in %s',
                             len(ends), lines)
            return "No end found in \n\n %s" % "\n".join(lines)
        end = start + ends[0]
        text = unicode("\n".join(lines[start:end]))
        # Log the article number and absolute line indices; the old code
        # logged a leaked list-comprehension variable and a relative offset.
        self.log.debug('Found Text for %d between %d and %d: %s',
                       self.heisenr, start, end, text)
        return text

    def get_heise_text(self):
        """Return the article text (see ``_get_heise_text_cached``)."""
        text = self._get_heise_text_cached()

        return text

    # NOTE(review): Lazy comes from the project's Lazy module; presumably it
    # evaluates the method on first access and remembers the result -- confirm.
    @Lazy
    def text(self):
        return self.get_heise_text()
class HeiseParserXMLObject(object):
    """Parse the news overview page into a list of ``Meldung`` objects.

    ``feed()`` cuts the article ``<div>`` elements out of the page with a
    regular expression, wraps them in a ``<news>`` root element, repairs
    known unescaped ampersands and hands the result to ``xmlobject``.
    Parsed items accumulate in ``self.meldungen``.
    """
    # Article URL pattern; the link suffix found on the page is inserted.
    # Earlier candidates, superseded by the mobile site:
    #BASE = "http://www.heise.de/newsticker/meldung/%s"
    #BASE = "http://www.heise.de/newsticker/meldung/%s?view=print"
    BASE = 'http://heise-online.mobi/%s'

    # One match per article <div>; group 1 is the markup, group 2 the number.
    _ARTIKEL_RE = re.compile(
        """(<div id="news_([0-9]+)".*?class="artikel">.*?</div>)""")

    # Text fragments known to appear on the page with a raw '&', which is
    # not well-formed XML.
    _BROKEN_CHUNKS = (
        ' & ',
        '1&1',
        'AT&T',
        'PG&E',
        'Command&Control-Servers',
        'Giesecke & Devrient',
        'Technology & Operations',
        'a&o',
        'S&P',
        'Look&Feel',  # the comma was missing, fusing this entry with 'ES&S'
        'ES&S',
    )

    _ID_PREFIX = 'news_'

    def __init__(self, *args, **kwargs):
        self.log = logging.getLogger('ParserXMLObject')
        self.meldungen = []

    @classmethod
    def _escape_known_ampersands(cls, text):
        """Return *text* with '&' escaped inside every known broken chunk."""
        for chunk in cls._BROKEN_CHUNKS:
            text = text.replace(chunk, chunk.replace('&', '&amp;'))
        return text

    @staticmethod
    def _offending_word(line, offset):
        """Return the space-delimited stretch of *line* around *offset*.

        The slice starts at the preceding space (or the start of the line)
        and ends before the following space (or at the end of the line).
        """
        first_space = line.rfind(' ', 0, offset)
        last_space = line.find(' ', offset)
        if first_space == -1:
            first_space = 0
        if last_space == -1:
            # Previously left at -1, which cut off the last character.
            last_space = len(line)
        return line[first_space:last_space]

    def _log_parse_error(self, error, xmlo_raw):
        """Log the line and word of *xmlo_raw* that *error* points at."""
        lineno = getattr(error, 'lineno', None)
        if lineno is None:
            return
        xml_lines = xmlo_raw.splitlines()
        if not 1 <= lineno <= len(xml_lines):
            return
        offending_line = xml_lines[lineno - 1]
        self.log.critical("Offending line: %d\n%s", lineno, offending_line)

        offset = getattr(error, 'offset', None)
        if offset is None:
            return
        # NOTE(review): the parser saw UTF-8 encoded bytes while this line
        # is unicode, so the offset may be slightly off for non-ASCII text.
        self.log.critical("Offending word:\n%s",
                          self._offending_word(offending_line, offset))

    def feed(self, xml):
        """Parse the page *xml* and return the accumulated ``self.meldungen``.

        Raises ``ExpatError`` if the assembled document is not well-formed
        and ``AttributeError`` if it contains no ``div`` elements; both are
        logged before being re-raised.
        """
        heisestring = unicode(xml).replace('\n', '').replace('\t', '')

        artikel_xml = "".join(
            "%s\n" % artikel
            for artikel, _nr in self._ARTIKEL_RE.findall(heisestring))

        heise_xml = '<?xml version="1.0" encoding="utf-8"?>'
        xmlo_raw = u"%s\n<news>%s</news>" % (heise_xml, artikel_xml)

        ## Fix XHTML Errors
        xmlo_raw = self._escape_known_ampersands(xmlo_raw)

        self.log.debug('trying to create XML Object from\n%s\n%s',
                       str(type(xmlo_raw)), xmlo_raw)
        try:
            xmlo = xmlobject.XMLFile(raw=xmlo_raw.encode('utf-8'))
        except ExpatError as e:
            self.log.critical("Couldn't create XMLObject from\n%s",
                              xmlo_raw)
            self._log_parse_error(e, xmlo_raw)
            # Bare raise keeps the original traceback.
            raise

        try:
            meldungen = xmlo.root.div
        except AttributeError:
            self.log.critical("Error getting a meldung from\n%s", xmlo_raw)
            raise

        for meldung in meldungen:
            m = Meldung()
            # The id has the form "news_<digits>" (see _ARTIKEL_RE).
            m.heisenr = int(meldung.id[len(self._ID_PREFIX):])

            datum_raw = meldung.span._text
            m.time = datetime.datetime.strptime(datum_raw,
                                                '%d.%m.%Y - %H:%M Uhr')

            m.title = meldung.h3.a._text
            m.abstract = meldung.p._text

            suffix = meldung.h3.a.href
            # heise quirk: the article numbers in the URLs of the regular
            # newsticker are one less than those of the mobile version,
            # hence the mobile BASE is used with the unmodified suffix.
            #link = self.BASE % suffix[len('/news/'):]
            m.link = self.BASE % suffix

            self.log.info('appending meldung: %s', m)
            self.meldungen.append(m)

        return self.meldungen


# Parser implementation instantiated by HeiseFeedParser.parse().
HeiseParser = HeiseParserXMLObject
class HeiseFeedParser(object):
    """Fetch news overview pages, parse them and emit an ATOM feed.

    Every item returned by ``parse()`` is also collected in
    ``self.meldungen``, which is what the ATOM output is built from.
    """

    def __init__(self):
        self.log = logging.getLogger('FeedParser')
        self.meldungen = []

    def fetch(self, url="http://heise-online.mobi/?seite=%d", index=0):
        """Download page *index* of *url* and return it decoded from UTF-8.

        *url* must contain one ``%d`` placeholder for the page index.
        """
        url = url % index
        response = urllib2.urlopen(url)
        try:
            buf = response.read().decode('utf-8')
        finally:
            response.close()
        self.log.debug('Fetched %s', buf)
        return buf

    def fetch_and_parse(self):
        """Fetch the first page and return the items parsed from it."""
        buf = self.fetch()
        return self.parse(buf)

    def fetch_many_and_parse(self, count=5):
        """Fetch pages ``0`` through *count* and return all parsed items.

        Note that the range is inclusive, so ``count + 1`` pages are
        fetched.
        """
        ret = []
        for i in xrange(0, count + 1):
            buf = self.fetch(index=i)
            ret.extend(self.parse(buf))
        return ret

    def parse(self, html):
        """Parses a html string and returns a list of parsed cases
        """
        self.log.debug(u'Calling parse() with locals: %s', locals())

        p = HeiseParser()
        p.feed(html)
        self.meldungen.extend(p.meldungen)
        return p.meldungen

    def get_atom_entries(self):
        """Return one ATOM ``entry`` element per collected item."""
        entries = []
        total = len(self.meldungen)
        for (nr, meldung) in enumerate(self.meldungen):
            self.log.debug('Trying to append meldung %s (%d/%d)',
                           meldung, nr + 1, total)
            entries.append(E.entry(
                E.link(
                    href=meldung.link,
                    title=meldung.title,
                ),
                E.published(format_date(meldung.time)),
                E.updated(format_date(meldung.time)),
                E.id("%d" % meldung.heisenr),
                E.title(meldung.title),
                E.summary(meldung.abstract),
                # The full article text is deliberately not embedded:
                # E.content(meldung.text, type="text/html"),
            ))
        return entries

    def get_latest_update(self):
        """Return the most recent ``time`` among the collected items.

        Raises ``ValueError`` if nothing has been parsed yet.
        """
        if not self.meldungen:
            raise ValueError('No Meldungen available to determine the '
                             'latest update from')
        return max(meldung.time for meldung in self.meldungen)

    def to_atom(self):
        """Return the whole feed as a string, including the XML header."""
        atom_feed = E.feed(
            E.author(
                E.name("heise online"),
            ),
            E.title("heise online News"),
            E.subtitle("Nachrichten nicht nur aus der Welt der Computer"),
            E.icon("http://heise-online.mobi//icons/logo.gif"),
            E.id("http://www.heise.de/newsticker/"),
            E.updated(format_date(self.get_latest_update())),
            E.generator("Muellis ETree ATOM Generator"),

            xmlns="http://www.w3.org/2005/Atom",
            *self.get_atom_entries()
        )
        xmlstr = ET.tostring(atom_feed)
        xmlheader = u'<?xml version="1.0" encoding="utf-8" ?>\n'
        return xmlheader + xmlstr

    def to_atom_file(self, fname):
        """Write the feed, UTF-8 encoded, to the file *fname*."""
        # Build the feed before opening the file so that a failure does not
        # leave a truncated file behind.
        payload = self.to_atom().encode('utf-8')
        with open(fname, "w") as f:
            f.write(payload)
if __name__ == "__main__":
    # Command line entry point: fetch the news pages and write the ATOM
    # feed to the file given as first argument (default: heise-atom.xml).
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] filename")
    parser.add_option("-l", "--loglevel", dest="loglevel",
                      help="Sets the loglevel to one of debug, info, warn,"
                           " error, critical", default="error")
    (options, args) = parser.parse_args()
    loglevels = {'debug': logging.DEBUG, 'info': logging.INFO,
                 'warn': logging.WARN, 'error': logging.ERROR,
                 'critical': logging.CRITICAL}
    # Unknown names fall back to ERROR.  The fallback used to be the string
    # "error", which is not a valid logging level.
    loglevel = loglevels.get(options.loglevel.lower(), logging.ERROR)
    logging.basicConfig(level=loglevel)
    log = logging.getLogger("HeiseClients Main")

    path = args[0] if args else "heise-atom.xml"

    HFP = HeiseFeedParser()
    items = HFP.fetch_many_and_parse()
    if not items:
        # An explicit check instead of assert, which vanishes under -O.
        log.critical("No Meldungen could be parsed; not writing %s", path)
        raise SystemExit(1)
    HFP.to_atom_file(path)
