geRSSicht

view src/heisefeed.py @ 82:824525444374

handle 0 entries
author muelli@bigbox <muelli@auftrags-killer.org>
date Sat Jan 02 16:44:02 2010 +0100 (6 months ago)
parents 38433a1cc6ed
children
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """
4 A simple module which parses a website and writes an ATOM file.
5 """
7 import monkeypatch
9 import datetime
10 import htmlentitydefs
11 import logging
12 import os
13 import re
14 import urllib2
15 import xml.etree.cElementTree as ET
16 from xml.parsers.expat import ExpatError
18 from Lazy import Lazy
19 import xmlobject
20 from deps.E import E
def format_date(date):
    """Return *date* as the string produced by its ``isoformat()`` method.

    Works for any object offering ``isoformat()``, e.g. ``datetime.datetime``
    or ``datetime.date``.
    """
    iso_string = date.isoformat()
    return iso_string
class Logged():
    """Callable wrapper that remembers a function.

    This is a stub: calling an instance takes no arguments, does not invoke
    the stored function and returns ``None``.
    """

    def __init__(self, func):
        # Keep a reference to the wrapped callable.
        self.func = func

    def __call__(self):
        # Intentionally a no-op.
        return None
class Meldung(object):
    """One news item ("Meldung") plus disk-cached access to its article page.

    The plain attributes (``time``, ``heisenr``, ``link``, ``abstract``,
    ``title``) start out as ``None`` and are filled in by whoever creates the
    instance.  The article page behind ``link`` is fetched on demand and
    cached below ``TMPDIR`` in two files per item: the raw page lines and the
    extracted article text.
    """

    # Cache directory and per-item file name patterns (formatted with heisenr).
    TMPDIR = '/tmp/heise-atom'
    TMPPATTERN_LINES = '%s/' % TMPDIR + 'heise-%d.html'
    TMPPATTERN_TEXT = '%s/' % TMPDIR + 'heise-%d-text.html'

    def __init__(self):
        self.time = None
        self.heisenr = None
        self.link = None
        self.abstract = None
        self.title = None
        # The article text is exposed through the ``text`` attribute below
        # instead of being fetched eagerly here.
        self.log = logging.getLogger('HeiseMeldung')

    def __unicode__(self):
        return u"<Meldung %s from %s>" % (self.heisenr, self.time)

    def __str__(self):
        return "<Meldung %s from %s>" % (self.heisenr, self.time)

    def _get_heise_text_lines_cached(self):
        """Return the article page as a list of unicode lines.

        The lines come from the cache file if it can be read; otherwise they
        are fetched via ``_get_heise_text_lines_noncached`` and written to
        the cache (creating ``TMPDIR`` with mode 0700 if necessary).

        Raises ``OSError`` if the cache directory cannot be created for a
        reason other than "already exists".
        """
        import errno  # local import: only needed for the EEXIST check below

        nr = self.heisenr
        cache_fname = self.TMPPATTERN_LINES % (nr, )
        try:
            with open(cache_fname, 'r') as cache_file:
                data = [line.decode('utf-8')
                        for line in cache_file.readlines()]
            self.log.debug('Cache HIT! %d', nr)
        except IOError:
            data = self._get_heise_text_lines_noncached()
            self.log.debug('Cache MISS %d', nr)
            try:
                os.makedirs(self.TMPDIR, 0o700)
            except OSError as e:
                # An already existing cache directory is fine.
                if e.errno != errno.EEXIST:
                    raise

            # The fetched lines still carry their own line endings, so they
            # are concatenated as-is.  Joining them with os.linesep would
            # insert an extra blank line after every line, and a later cache
            # hit would return different lines than the original fetch.
            with open(cache_fname, 'w') as cache_file:
                cache_file.write(
                    ''.join(line.encode('utf-8') for line in data)
                )

        return data

    def _get_heise_text_lines_noncached(self):
        """Fetch ``self.link`` and return its content as unicode lines."""
        url = self.link
        self.log.debug('Trying to fetch and save %s as %d', url, self.heisenr)
        response = urllib2.urlopen(url)
        try:
            lines = [line.decode('utf-8') for line in response.readlines()]
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

        return lines

    def _get_heise_text_cached(self):
        """Return the extracted article text, using the text cache file.

        A missing, unreadable or empty cache file counts as a cache miss: the
        text is extracted again and the cache file is rewritten.
        """
        nr = self.heisenr
        cache_fname = self.TMPPATTERN_TEXT % (nr, )
        try:
            with open(cache_fname, 'r') as cache_file:
                data = cache_file.read().decode('utf-8')
        except IOError:
            data = None

        if data:
            self.log.debug('Text Cache HIT! %d', nr)
            return data

        data = self._get_heise_text_noncached()
        self.log.debug('Text Cache MISS %d', nr)
        with open(cache_fname, 'w') as cache_file:
            cache_file.write(data.encode('utf-8'))

        return data

    def _get_heise_text_noncached(self):
        """Cut the article markup out of the (cached) page lines.

        The article starts at the single line beginning with
        ``<div id="artikel">`` and ends before the first following line that
        begins with ``</div>``.  If the start is not found exactly once, or
        no end is found, a diagnostic string containing the whole page is
        returned instead of the article.
        """
        lines = self._get_heise_text_lines_cached()
        self.log.debug('Read %d lines', len(lines))

        starts = [idx for (idx, line) in enumerate(lines)
                  if line.strip().startswith('<div id="artikel">')]
        if len(starts) != 1:
            self.log.warning('No Start (%d) of Meldung found in %s',
                             len(starts), lines)
            return "No start found in \n\n %s" % "\n".join(lines)
        start = starts[0]

        # Offsets are relative to ``start`` because the slice is enumerated.
        ends = [offset for (offset, line) in enumerate(lines[start:])
                if line.strip().startswith('</div>')]
        if not ends:
            self.log.warning('No end (%d) of Meldung found in %s',
                             len(ends), lines)
            return "No end found in \n\n %s" % "\n".join(lines)
        end = start + ends[0]

        text = u"\n".join(lines[start:end])
        # Uses self.heisenr: there is no local ``nr`` in this method, so the
        # former reference to it raised NameError on every successful run.
        self.log.debug('Found Text for %d between %d and %d: %s',
                       self.heisenr, start, end, text)
        return text

    def get_heise_text(self):
        """Return the article text (cached on disk)."""
        text = self._get_heise_text_cached()

        return text

    # NOTE(review): ``Lazy`` comes from the project's Lazy module; presumably
    # it evaluates the method on first attribute access - confirm there.
    @Lazy
    def text(self):
        return self.get_heise_text()
class HeiseParserXMLObject(object):
    """Turns the HTML of a news overview page into ``Meldung`` objects.

    ``feed()`` cuts the article ``<div>`` blocks out of the page with a
    regular expression, wraps them in a ``<news>`` root element, escapes a
    list of known bare ampersands and parses the result with ``xmlobject``.
    Parsed items accumulate in ``self.meldungen``.
    """

    # Link template; the href found in the overview page is substituted.
    # Earlier variants, kept for reference:
    #   "http://www.heise.de/newsticker/meldung/%s"
    #   "http://www.heise.de/newsticker/meldung/%s?view=print"
    BASE = 'http://heise-online.mobi/%s'

    # Matches one article block; group 2 is the numeric news id.
    ARTIKEL_REGEX = """(<div id="news_([0-9]+)".*?class="artikel">.*?</div>)"""

    # Text fragments known to appear with an unescaped '&' in the page.
    # Every entry needs its own trailing comma: a missing one silently
    # concatenates two neighbouring strings into a single useless entry.
    BROKEN_CHUNKS = (
        ' & ',
        '1&1',
        'AT&T',
        'PG&E',
        'Command&Control-Servers',
        'Giesecke & Devrient',
        'Technology & Operations',
        'a&o',
        'S&P',
        'Look&Feel',
        'ES&S',
    )

    def __init__(self, *args, **kwargs):
        self.log = logging.getLogger('ParserXMLObject')
        self.meldungen = []

    @classmethod
    def _escape_known_ampersands(cls, raw):
        """Return *raw* with the '&' of every known broken chunk escaped."""
        for chunk in cls.BROKEN_CHUNKS:
            raw = raw.replace(chunk, chunk.replace('&', '&amp;'))
        return raw

    @staticmethod
    def _offending_word(line, offset):
        """Return the space-delimited word of *line* that contains *offset*."""
        # rfind() yields -1 when there is no space before the offset; adding
        # one then conveniently gives the start of the line.
        start = line.rfind(' ', 0, offset) + 1
        end = line.find(' ', offset)
        if end == -1:
            # No space after the offset: the word runs to the end of the line.
            end = len(line)
        return line[start:end]

    def _log_parse_error(self, error, xmlo_raw):
        """Log the document and, if available, the line and word that broke it."""
        self.log.critical("Couldn't create XMLObject from\n%s", xmlo_raw)
        if not hasattr(error, 'lineno'):
            return

        offending_lineno = error.lineno
        xml_lines = xmlo_raw.splitlines()
        offending_line = xml_lines[offending_lineno - 1]
        self.log.critical("Offending line: %d\n%s",
                          offending_lineno, offending_line)
        # NOTE(review): the parser was fed UTF-8 bytes while this line is
        # unicode, so the offset may be shifted for non-ASCII text - confirm.
        self.log.critical("Offending word:\n%s",
                          self._offending_word(offending_line, error.offset))

    def feed(self, xml):
        """Parse the page source *xml* and return ``self.meldungen``.

        Every article found is appended to ``self.meldungen`` as a
        ``Meldung`` with ``heisenr``, ``time``, ``title``, ``abstract`` and
        ``link`` set.

        Raises ``ExpatError`` if the assembled document is not well-formed
        and ``AttributeError`` if the parsed document has no ``div`` below
        its root.
        """
        heisestring = unicode(xml).replace('\n', '').replace('\t', '')

        artikel_xml = "".join(
            "%s\n" % artikel
            for artikel, _nr in re.findall(self.ARTIKEL_REGEX, heisestring)
        )

        heise_xml = '<?xml version="1.0" encoding="utf-8"?>'
        xmlo_raw = u"%s\n<news>%s</news>" % (heise_xml, artikel_xml)

        ## Fix XHTML Errors
        xmlo_raw = self._escape_known_ampersands(xmlo_raw)

        self.log.debug('trying to create XML Object from\n%s\n%s',
                       str(type(xmlo_raw)), xmlo_raw)
        try:
            xmlo = xmlobject.XMLFile(raw=xmlo_raw.encode('utf-8'))
        except ExpatError as e:
            self._log_parse_error(e, xmlo_raw)
            # Bare raise keeps the original traceback.
            raise

        try:
            meldungen = xmlo.root.div
        except AttributeError:
            self.log.critical("Error getting a meldung from\n%s", xmlo_raw)
            raise

        for meldung in meldungen:
            m = Meldung()

            # Remove the literal prefix; str.strip() would treat 'news_' as a
            # set of characters rather than as a prefix.
            raw_id = meldung.id
            if raw_id.startswith('news_'):
                raw_id = raw_id[len('news_'):]
            m.heisenr = int(raw_id)

            datum_raw = meldung.span._text
            m.time = datetime.datetime.strptime(datum_raw,
                                                '%d.%m.%Y - %H:%M Uhr')

            m.title = meldung.h3.a._text
            m.abstract = meldung.p._text

            # The href is used as-is: according to the original author the
            # article numbers in the regular newsticker URLs are one lower
            # than those of the mobile version, so it is not rewritten.
            suffix = meldung.h3.a.href
            m.link = self.BASE % suffix

            self.log.info('appending meldung: %s', m)
            self.meldungen.append(m)

        return self.meldungen


# Name under which the rest of the module refers to the parser.
HeiseParser = HeiseParserXMLObject
class HeiseFeedParser(object):
    """Fetches overview pages, parses them and renders the result as Atom.

    Parsed items are collected in ``self.meldungen`` across calls to
    ``parse()``; ``to_atom()`` and ``to_atom_file()`` render that list.
    """

    def __init__(self):
        self.log = logging.getLogger('FeedParser')
        self.meldungen = []

    def fetch(self, url="http://heise-online.mobi/?seite=%d", index=0):
        """Download page number *index* and return it as unicode.

        *url* is a format string taking the page index.
        """
        url = url % index
        response = urllib2.urlopen(url)
        try:
            buf = response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
        return buf

    def fetch_and_parse(self):
        """Fetch the first page and return the items parsed from it."""
        buf = self.fetch()
        return self.parse(buf)

    def fetch_many_and_parse(self, count=5):
        """Fetch pages 0 through *count* and return all parsed items.

        Note that the range is inclusive, so ``count + 1`` pages are fetched.
        """
        ret = []
        for i in xrange(0, count + 1):
            buf = self.fetch(index=i)
            ret += self.parse(buf)
        return ret

    def parse(self, html):
        """Parses a html string and returns a list of parsed cases
        """
        self.log.debug(u'Calling parse() with locals: %s', locals())

        p = HeiseParser()
        p.feed(html)
        self.meldungen.extend(p.meldungen)
        return p.meldungen

    def get_atom_entries(self):
        """Return one Atom ``entry`` element per collected item."""
        entries = []
        total = len(self.meldungen)
        for (nr, meldung) in enumerate(self.meldungen):
            self.log.debug('Trying to append meldung %s (%d/%d)',
                           meldung, nr + 1, total)
            # Author and full-text content elements are deliberately left
            # out of the entry.
            entries.append(E.entry(
                E.link(
                    href=meldung.link,
                    title=meldung.title,
                ),
                E.published(format_date(meldung.time)),
                E.updated(format_date(meldung.time)),
                E.id("%d" % meldung.heisenr),
                E.title(meldung.title),
                E.summary(meldung.abstract),
            ))
        return entries

    def get_latest_update(self):
        """Return the most recent ``time`` of all collected items.

        Raises ``ValueError`` if nothing has been collected yet.
        """
        if not self.meldungen:
            # max() of an empty sequence raises ValueError as well, but with
            # a message that does not point at the real cause.
            raise ValueError("No Meldungen available to determine the "
                             "latest update from")
        return max(meldung.time for meldung in self.meldungen)

    def to_atom(self):
        """Return the complete Atom feed document as a string."""
        atom_feed = E.feed(
            E.author(
                E.name("heise online"),
            ),
            E.title("heise online News"),
            E.subtitle("Nachrichten nicht nur aus der Welt der Computer"),
            E.icon("http://heise-online.mobi//icons/logo.gif"),
            E.id("http://www.heise.de/newsticker/"),
            E.updated(format_date(self.get_latest_update())),
            E.generator("Muellis ETree ATOM Generator"),
            xmlns="http://www.w3.org/2005/Atom",
            *self.get_atom_entries()
        )
        xmlstr = ET.tostring(atom_feed)
        xmlheader = u'<?xml version="1.0" encoding="utf-8" ?>\n'
        return xmlheader + xmlstr

    def to_atom_file(self, fname):
        """Write the Atom feed, UTF-8 encoded, to the file *fname*."""
        # Render first so that a rendering error does not truncate an
        # existing file, then let the context manager close the handle even
        # if the write fails.
        payload = self.to_atom().encode('utf-8')
        with open(fname, "w") as f:
            f.write(payload)
if __name__ == "__main__":
    # Command line entry point: fetch the overview pages and write the Atom
    # feed to the file given as first argument (default: heise-atom.xml).
    from optparse import OptionParser

    parser = OptionParser("usage: %prog [options] filename")
    parser.add_option("-l", "--loglevel", dest="loglevel",
                      help="Sets the loglevel to one of debug, info, warn,"
                           " error, critical", default="error")
    (options, args) = parser.parse_args()

    loglevels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warn': logging.WARN,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }
    # Fall back to the ERROR constant for unknown names.  The former fallback
    # was the string "error", which is not a valid level for basicConfig().
    loglevel = loglevels.get(options.loglevel.lower(), logging.ERROR)
    logging.basicConfig(level=loglevel)
    log = logging.getLogger("HeiseClients Main")

    if len(args) > 0:
        path = args[0]
    else:
        path = "heise-atom.xml"

    HFP = HeiseFeedParser()
    items = HFP.fetch_many_and_parse()
    if not items:
        # Explicit check instead of an assert, which is stripped under -O.
        log.critical("No Meldungen could be parsed; not writing %s", path)
        raise SystemExit(1)
    HFP.to_atom_file(path)