| rev |
line source |
|
muelli@6
|
1 #!/usr/bin/env python
|
|
muelli@6
|
2 # -*- coding: utf-8 -*-
|
|
muelli@6
|
3 """
|
|
muelli@19
|
4 A simple module which parses a website and writes an ATOM file.
|
|
muelli@6
|
5 """
|
|
muelli@6
|
6
|
|
muelli@6
|
7 import monkeypatch
|
|
muelli@6
|
8
|
|
muelli@6
|
9 import datetime
|
|
muelli@6
|
10 import htmlentitydefs
|
|
muelli@40
|
11 import logging
|
|
muelli@19
|
12 import os
|
|
muelli@79
|
13 import re
|
|
muelli@7
|
14 import urllib2
|
|
muelli@6
|
15 import xml.etree.cElementTree as ET
|
|
muelli@36
|
16 from xml.parsers.expat import ExpatError
|
|
muelli@6
|
17
|
|
muelli@30
|
18 from Lazy import Lazy
|
|
muelli@6
|
19 import xmlobject
|
|
muelli@6
|
20 from deps.E import E
|
|
muelli@6
|
21
|
|
muelli@6
|
22
|
|
muelli@6
|
def format_date(date):
    """Return *date* rendered by its own ``isoformat()`` method.

    ``date`` is any object exposing ``isoformat()`` (the callers in this
    module pass ``datetime.datetime`` instances).
    """
    iso_string = date.isoformat()
    return iso_string
|
|
muelli@6
|
25
|
|
muelli@6
|
class Logged(object):
    """Callable wrapper that stores a function.

    NOTE(review): this looks like an unfinished logging decorator --
    ``__call__`` accepts no arguments and never invokes ``self.func``.
    Confirm the intended behaviour before decorating anything with it.
    """

    def __init__(self, func):
        # Keep a reference to the wrapped callable.
        self.func = func

    def __call__(self):
        # No-op: returns None without calling self.func.
        pass
|
|
muelli@6
|
31
|
|
muelli@19
|
class Meldung(object):
    """A single news item ("Meldung") with disk-cached access to its text.

    ``time``, ``heisenr``, ``link``, ``abstract`` and ``title`` start out as
    ``None`` and are filled in by the parser.  The article page is fetched
    from ``link`` on demand; both the raw page lines and the extracted
    article text are cached as files below ``TMPDIR``.
    """
    TMPDIR = '/tmp/heise-atom'
    # Cache file name patterns; the %d placeholder receives ``heisenr``.
    TMPPATTERN_LINES = '%s/' % TMPDIR + 'heise-%d.html'
    TMPPATTERN_TEXT = '%s/' % TMPDIR + 'heise-%d-text.html'

    def __init__(self):
        self.time = None
        self.heisenr = None
        self.link = None
        self.abstract = None
        self.title = None
        self.log = logging.getLogger('HeiseMeldung')

    def __unicode__(self):
        return u"<Meldung %s from %s>" % (self.heisenr, self.time)

    def __str__(self):
        return "<Meldung %s from %s>" % (self.heisenr, self.time)

    def _get_heise_text_lines_cached(self):
        """Return the article page as a list of unicode lines.

        The lines are read from the cache file named by ``TMPPATTERN_LINES``.
        If that file cannot be opened, the page is fetched with
        ``_get_heise_text_lines_noncached`` and written to the cache,
        creating ``TMPDIR`` when it does not exist yet.

        Raises OSError if ``TMPDIR`` cannot be created for a reason other
        than already existing.
        """
        import errno  # only needed for the EEXIST check below

        nr = self.heisenr
        cache_fname = self.TMPPATTERN_LINES % (nr, )
        try:
            with open(cache_fname, 'rb') as cache_file:
                data = [line.decode('utf-8') for line in cache_file]
            self.log.debug('Cache HIT! %d', nr)
        except IOError:
            data = self._get_heise_text_lines_noncached()
            self.log.debug('Cache MISS %d', nr)
            try:
                os.makedirs(self.TMPDIR, 0o700)
            except OSError as e:
                # An already existing cache directory is fine.
                if e.errno != errno.EEXIST:
                    raise

            # The fetched lines still carry their own line terminators, so
            # they are concatenated as-is; joining them with a separator
            # would insert extra blank lines into the cached copy.
            with open(cache_fname, 'wb') as cache_file:
                cache_file.write(u''.join(data).encode('utf-8'))

        return data

    def _get_heise_text_lines_noncached(self):
        """Fetch ``self.link`` and return the page as UTF-8 decoded lines."""
        url = self.link
        self.log.debug('Trying to fetch and save %s as %d', url, self.heisenr)
        response = urllib2.urlopen(url)
        try:
            lines = [line.decode('utf-8') for line in response.readlines()]
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

        return lines

    def _get_heise_text_cached(self):
        """Return the article text, using the ``TMPPATTERN_TEXT`` cache file.

        A missing, unreadable or empty cache file counts as a miss: the
        text is then extracted with ``_get_heise_text_noncached`` and the
        cache file is (re)written.
        """
        nr = self.heisenr
        cache_fname = self.TMPPATTERN_TEXT % (nr, )
        try:
            with open(cache_fname, 'rb') as cache_file:
                data = cache_file.read().decode('utf-8')
        except IOError:
            data = None

        if data:
            self.log.debug('Text Cache HIT! %d', nr)
            return data

        # An empty file (e.g. left behind by an interrupted write) is
        # regenerated instead of aborting on every later run.
        data = self._get_heise_text_noncached()
        self.log.debug('Text Cache MISS %d', nr)
        with open(cache_fname, 'wb') as cache_file:
            cache_file.write(data.encode('utf-8'))

        return data

    def _get_heise_text_noncached(self):
        """Extract the article markup from the page lines.

        The article starts at the single line beginning with
        ``<div id="artikel">`` and runs up to (not including) the first
        following line beginning with ``</div>``.  If the start marker is
        not found exactly once, or no end marker follows it, a diagnostic
        string containing the whole page is returned instead.
        """
        lines = self._get_heise_text_lines_cached()
        self.log.debug('Read %d lines', len(lines))
        starts = [idx for (idx, line) in enumerate(lines)
                  if line.strip().startswith('<div id="artikel">')]
        if len(starts) != 1:
            self.log.warning('No Start (%d) of Meldung found in %s',
                             len(starts), lines)
            return u"No start found in \n\n %s" % u"\n".join(lines)
        start = starts[0]
        # Offsets are relative to ``start`` because the slice is enumerated.
        ends = [idx for (idx, line) in enumerate(lines[start:])
                if line.strip().startswith('</div>')]
        if not ends:
            self.log.warning('No end (%d) of Meldung found in %s',
                             len(ends), lines)
            return u"No end found in \n\n %s" % u"\n".join(lines)
        end = ends[0]
        text = u"\n".join(lines[start:start + end])
        # Log the article number and absolute line positions.
        self.log.debug('Found Text for %d between %d and %d: %s',
                       self.heisenr, start, start + end, text)
        return text

    def get_heise_text(self):
        """Return the article text via the on-disk text cache."""
        return self._get_heise_text_cached()

    @Lazy
    def text(self):
        # NOTE(review): presumably ``Lazy`` defers this call until the
        # attribute is first accessed -- confirm against the Lazy module.
        return self.get_heise_text()
|
|
muelli@71
|
127
|
|
muelli@71
|
class HeiseParserXMLObject(object):
    """Parse a heise mobile overview page into ``Meldung`` objects.

    ``feed()`` cuts the article ``<div>`` elements out of the page with a
    regular expression, wraps them in a ``<news>`` root element, repairs
    known unescaped ampersands and hands the result to ``xmlobject``.
    """
    # URL pattern the article's href is substituted into.
    # Alternatives that are not in use:
    #   "http://www.heise.de/newsticker/meldung/%s"
    #   "http://www.heise.de/newsticker/meldung/%s?view=print"
    BASE = 'http://heise-online.mobi/%s'

    # Matches one article <div>; group 1 is the whole element, group 2 the
    # numeric part of its id.
    ARTICLE_REGEX = re.compile(
        """(<div id="news_([0-9]+)".*?class="artikel">.*?</div>)""")

    # Text fragments known to appear with a raw '&', which is not
    # well-formed XML.  They are replaced in this order.
    BROKEN_CHUNKS = (
        ' & ',
        '1&1',
        'AT&T',
        'PG&E',
        'Command&Control-Servers',
        'Giesecke & Devrient',
        'Technology & Operations',
        'a&o',
        'S&P',
        'Look&Feel',
        'ES&S',
    )

    def __init__(self, *args, **kwargs):
        self.log = logging.getLogger('ParserXMLObject')
        self.meldungen = []

    @classmethod
    def _extract_articles(cls, heisestring):
        """Return all article <div> elements, each followed by a newline."""
        return "".join("%s\n" % artikel
                       for artikel, _nr in cls.ARTICLE_REGEX.findall(heisestring))

    @classmethod
    def _escape_ampersands(cls, raw):
        """Replace the '&' inside every known broken chunk by '&amp;'."""
        for chunk in cls.BROKEN_CHUNKS:
            raw = raw.replace(chunk, chunk.replace('&', '&amp;'))
        return raw

    def _log_offending_input(self, error, xmlo_raw):
        """Log the line and the word an expat parse error points at."""
        if not hasattr(error, 'lineno'):
            return
        offending_lineno = error.lineno
        xml_lines = xmlo_raw.splitlines()
        if not 0 < offending_lineno <= len(xml_lines):
            # Never let the diagnostics mask the parse error itself.
            return
        offending_line = xml_lines[offending_lineno - 1]
        self.log.critical("Offending line: %d\n%s",
                          offending_lineno, offending_line)

        offending_char = error.offset
        first_space = offending_line.rfind(' ', 0, offending_char)
        last_space = offending_line.find(' ', offending_char)
        if first_space == -1:
            first_space = 0
        if last_space == -1:
            # No space after the error position: take the rest of the line.
            last_space = len(offending_line)

        offending_word = offending_line[first_space:last_space]
        self.log.critical("Offending word:\n%s", offending_word)

    def _meldung_from_node(self, node):
        """Build a ``Meldung`` from one parsed article <div> node."""
        m = Meldung()

        # The id looks like "news_<number>"; drop the prefix explicitly
        # (str.strip() would treat 'news_' as a set of characters).
        node_id = node.id
        if node_id.startswith('news_'):
            node_id = node_id[len('news_'):]
        m.heisenr = int(node_id)

        m.time = datetime.datetime.strptime(node.span._text,
                                            '%d.%m.%Y - %H:%M Uhr')
        m.title = node.h3.a._text
        m.abstract = node.p._text

        # Heise quirk noted by the author: the article numbers in the URLs
        # of the regular newsticker are one less than those of the mobile
        # version, so the href is used unmodified with the mobile BASE.
        m.link = self.BASE % node.h3.a.href
        return m

    def feed(self, xml):
        """Parse the page text *xml* and append the articles found.

        Returns ``self.meldungen``, i.e. everything collected by this
        parser instance so far.

        Raises ExpatError if the assembled XML is not well-formed and
        AttributeError if the parsed document has no ``div`` children.
        """
        heisestring = unicode(xml).replace('\n', '').replace('\t', '')

        artikel_xml = self._extract_articles(heisestring)

        heise_xml = '<?xml version="1.0" encoding="utf-8"?>'
        xmlo_raw = u"%s\n<news>%s</news>" % (heise_xml, artikel_xml)

        # Fix XHTML errors (unescaped ampersands).
        xmlo_raw = self._escape_ampersands(xmlo_raw)

        self.log.debug('trying to create XML Object from\n%s\n%s',
                       str(type(xmlo_raw)), xmlo_raw)
        try:
            xmlo = xmlobject.XMLFile(raw=xmlo_raw.encode('utf-8'))
        except ExpatError as e:
            self.log.critical("Couldn't create XMLObject from\n%s",
                              xmlo_raw)
            self._log_offending_input(e, xmlo_raw)
            # Bare raise keeps the original traceback.
            raise

        try:
            meldungen = xmlo.root.div
        except AttributeError:
            self.log.critical("Error getting a meldung from\n%s", xmlo_raw)
            raise

        for meldung in meldungen:
            m = self._meldung_from_node(meldung)
            self.log.info('appending meldung: %s', m)
            self.meldungen.append(m)

        return self.meldungen
|
|
muelli@19
|
228
|
|
muelli@19
|
229
|
|
muelli@19
|
# Alias naming the parser implementation that HeiseFeedParser.parse() uses.
HeiseParser = HeiseParserXMLObject
|
|
muelli@19
|
231
|
|
muelli@19
|
class HeiseFeedParser(object):
    """Fetch heise overview pages, parse them and emit an ATOM feed.

    Every parsed ``Meldung`` is accumulated in ``self.meldungen``; the
    ``to_atom*`` methods serialise that list.
    """

    def __init__(self):
        self.log = logging.getLogger('FeedParser')
        self.meldungen = []

    def fetch(self, url="http://heise-online.mobi/?seite=%d", index=0):
        """Download page *index* and return its UTF-8 decoded content.

        *url* is a pattern with one ``%d`` placeholder for *index*.
        """
        url = url % index
        response = urllib2.urlopen(url)
        try:
            buf = response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
        self.log.debug('Fetched %s', buf)
        return buf

    def fetch_and_parse(self):
        """Fetch the default page and return the items parsed from it."""
        return self.parse(self.fetch())

    def fetch_many_and_parse(self, count=5):
        """Fetch and parse the pages with index 0 up to and including *count*.

        Note that this is ``count + 1`` pages.  Returns the concatenated
        results of the individual ``parse()`` calls.
        """
        ret = []
        for i in range(0, count + 1):
            ret += self.parse(self.fetch(index=i))
        return ret

    def parse(self, html):
        """Parses a html string and returns a list of parsed cases
        """
        self.log.debug(u'Calling parse() with locals: %s', locals())

        p = HeiseParser()
        p.feed(html)
        self.meldungen.extend(p.meldungen)
        return p.meldungen

    def get_atom_entries(self):
        """Return one ATOM ``entry`` element per collected Meldung.

        The full article content (``meldung.text``) is not included.
        """
        entries = []
        total = len(self.meldungen)
        for (nr, meldung) in enumerate(self.meldungen):
            self.log.debug('Trying to append meldung %s (%d/%d)',
                           meldung, nr + 1, total)
            entries.append(E.entry(
                E.link(
                    href=meldung.link,
                    title=meldung.title,
                ),
                E.published(format_date(meldung.time)),
                E.updated(format_date(meldung.time)),
                E.id("%d" % meldung.heisenr),
                E.title(meldung.title),
                E.summary(meldung.abstract),
            ))
        return entries

    def get_latest_update(self):
        """Return the most recent ``time`` of all collected Meldungen.

        Raises ValueError if nothing has been collected yet.
        """
        return max([meldung.time for meldung in self.meldungen])

    def to_atom(self):
        """Serialise the collected Meldungen as an ATOM document string."""
        atom_feed = E.feed(
            E.author(
                E.name("heise online"),
            ),
            E.title("heise online News"),
            E.subtitle("Nachrichten nicht nur aus der Welt der Computer"),
            E.icon("http://heise-online.mobi//icons/logo.gif"),
            E.id("http://www.heise.de/newsticker/"),
            E.updated(format_date(self.get_latest_update())),
            E.generator("Muellis ETree ATOM Generator"),

            xmlns="http://www.w3.org/2005/Atom",
            *self.get_atom_entries()
        )
        xmlstr = ET.tostring(atom_feed)
        xmlheader = u'<?xml version="1.0" encoding="utf-8" ?>\n'
        return xmlheader + xmlstr

    def to_atom_file(self, fname):
        """Write the ATOM document, UTF-8 encoded, to the file *fname*."""
        # Build the document first so a failure does not leave a truncated
        # file behind; the with-block closes the file even if write() fails.
        payload = self.to_atom().encode('utf-8')
        with open(fname, "wb") as f:
            f.write(payload)
|
|
muelli@10
|
318
|
|
muelli@10
|
def loglevel_from_name(name):
    """Translate a loglevel name into the matching ``logging`` constant.

    Known names are debug, info, warn, error and critical; anything else
    falls back to ``logging.ERROR``.
    """
    levels = {'debug': logging.DEBUG, 'info': logging.INFO,
              'warn': logging.WARN, 'error': logging.ERROR,
              'critical': logging.CRITICAL}
    return levels.get(name, logging.ERROR)


def main(argv=None):
    """Fetch the news pages and write them as an ATOM file.

    *argv* is the argument list without the program name; ``None`` makes
    optparse read ``sys.argv[1:]``.  The first positional argument is the
    output file name and defaults to ``heise-atom.xml``.

    Raises SystemExit(1) if no news items could be parsed.
    """
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] filename")
    parser.add_option("-l", "--loglevel", dest="loglevel",
                      help="Sets the loglevel to one of debug, info, warn,"
                           " error, critical", default="error")
    (options, args) = parser.parse_args(argv)
    logging.basicConfig(level=loglevel_from_name(options.loglevel))
    log = logging.getLogger("HeiseClients Main")

    if len(args) > 0:
        path = args[0]
    else:
        path = "heise-atom.xml"

    hfp = HeiseFeedParser()
    items = hfp.fetch_many_and_parse()
    if not items:
        # An explicit check instead of assert: asserts vanish under -O.
        log.critical("No news items could be parsed; not writing %s", path)
        raise SystemExit(1)
    hfp.to_atom_file(path)


if __name__ == "__main__":
    main()
|