Difference between revisions of "ES/Desarrollo/Syndication"
Line 38: | Line 38: | ||
import HTMLParser | import HTMLParser | ||
import urllib2 | import urllib2 | ||
− | import | + | import MySQLdb |
import smtplib | import smtplib | ||
Line 62: | Line 62: | ||
− | class | + | class HeadlinesParser(HTMLParser.HTMLParser): |
def __init__(self): | def __init__(self): | ||
HTMLParser.HTMLParser.__init__(self) | HTMLParser.HTMLParser.__init__(self) | ||
Line 74: | Line 74: | ||
if "href" in attrs: | if "href" in attrs: | ||
current_href = attrs["href"] | current_href = attrs["href"] | ||
− | if " | + | if "newsItemID" in current_href: |
self.while_anchor = True | self.while_anchor = True | ||
self.last_href = current_href | self.last_href = current_href | ||
Line 140: | Line 140: | ||
s=SMTP(host) | s=SMTP(host) | ||
if loginUsername and loginPassword: | if loginUsername and loginPassword: | ||
− | s.login( | + | s.login(loginUserName,loginPassword) |
x=s.sendmail(de,para, msg.as_string(0)) | x=s.sendmail(de,para, msg.as_string(0)) | ||
s.quit() | s.quit() | ||
Line 150: | Line 150: | ||
try: | try: | ||
− | db = | + | db = MySQLdb.Connect(host=myhost,user=myuser,passwd=mypassword,db=mydb) |
return db | return db | ||
except: | except: | ||
Line 157: | Line 157: | ||
− | def | + | def insertarHeadLines(datos): |
"""Funcion para insertar los enlaces en mysql""" | """Funcion para insertar los enlaces en mysql""" | ||
Line 190: | Line 190: | ||
## print u'Enlace repetido' | ## print u'Enlace repetido' | ||
except: | except: | ||
− | print u'Problemas al realizar los | + | print u'Problemas al realizar los INSERT' |
return -1 | return -1 | ||
Line 238: | Line 238: | ||
import urllib2 | import urllib2 | ||
− | headlines_url = 'http://es.openoffice.org/servlets/ | + | headlines_url = 'http://es.openoffice.org/servlets/ProjectDocumentList?folderID=276&expandFolder=276&folderID=0' |
− | class | + | class HeadlinesParser(HTMLParser.HTMLParser): |
def __init__(self): | def __init__(self): | ||
HTMLParser.HTMLParser.__init__(self) | HTMLParser.HTMLParser.__init__(self) | ||
Line 247: | Line 247: | ||
self.while_anchor = False | self.while_anchor = False | ||
self.last_href = None | self.last_href = None | ||
− | self.mysearch = ' | + | self.mysearch = 'ProjectDocumentList' |
def handle_starttag(self, name, attrs): | def handle_starttag(self, name, attrs): | ||
Line 269: | Line 269: | ||
def main(): | def main(): | ||
source = urllib2.urlopen(headlines_url).read() | source = urllib2.urlopen(headlines_url).read() | ||
− | parser = | + | parser = HeadlinesParser() |
parser.feed(source) | parser.feed(source) | ||
Line 276: | Line 276: | ||
urldoc = headline[1].decode('utf8').encode('latin1') | urldoc = headline[1].decode('utf8').encode('latin1') | ||
docsource = urllib2.urlopen(urldoc).read() | docsource = urllib2.urlopen(urldoc).read() | ||
− | docparser = | + | docparser = HeadlinesParser() |
docparser.mysearch = 'documents' | docparser.mysearch = 'documents' | ||
docparser.feed(docsource) | docparser.feed(docsource) |
Revision as of 19:23, 30 May 2007
Syndicating OOo website
This script parses and syndicates the OOoES Announcements section and then registers the items in a database backend. It is still a simple script with no functionality beyond parsing the announcements page; by changing the project subdomain you can parse your own project's announcements.
TODO:
- Loop over multiple urls
- Parse other documents such as Documentation
[python]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2005 by Alexandro Colorado && Luis Cabrera
# Author:
#   Alexandro Colorado <jza@openoffice.org>
#   Luis Cabrera <lcabrera@sauco.org>
# Copyright: See COPYING file that comes with this distribution
#
# Usage text shown to the operator (kept in Spanish: it is runtime text).
# The published listing lost the triple quotes around it, which made the
# module unparseable; they are restored here.
__shell_usage__ = """Este programa esta pensado para recoger automaticamente
los enlaces localizados en una determinada pagina, compararlos con los ya
existentes en una bbdd, y, si hay nuevos enlaces, insertarlos en la bbdd y
posteriormente, mandar un correo a las listas de usuarios predeterminadas
con los enlaces a las nuevas noticias.
Si no hubieran noticias nuevas, el programa no debera hacer absolutamente
nada.
"""
__version__ = '20051024'
__author__ = 'Luis Cabrera Sauco && Alexandro Colorado'
import HTMLParser
import urllib2
import MySQLdb
import smtplib
################ CONFIGURATION VARIABLES ###############
# Page from which the initial links are collected.
# NOTE(review): the value was missing from the published listing; set it to
# the page whose anchors contain "newsItemID" before running the script.
headlines_url = ""

# Database connection settings
myhost = ""
myuser = ""
mypassword = ""
mydb = ""
mytb = ""

# Settings for the notification e-mails (enviar_correo == 1 enables sending)
enviar_correo = 1
remitente = ""
destinatario = ""
subject = ""
########################################################
class HeadlinesParser(HTMLParser.HTMLParser):
    """Collect the text and target of every news-item link in an HTML page.

    After ``feed()``, ``headlines`` maps the stripped text found inside each
    ``<a>`` element whose ``href`` contains ``"newsItemID"`` to that ``href``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.headlines = {}        # link text -> href
        self.while_anchor = False  # True while inside a matching <a> element
        self.last_href = None      # href of the matching <a> being read

    def handle_starttag(self, name, attrs):
        """Start recording when a news-item anchor opens."""
        if name != "a":
            return
        # A valueless ``href`` attribute is reported as None; treat it as an
        # empty string so the substring test below cannot raise TypeError.
        current_href = dict(attrs).get("href") or ""
        if "newsItemID" in current_href:
            self.while_anchor = True
            self.last_href = current_href

    def handle_data(self, content):
        """Store the text of the current news-item anchor."""
        if not self.while_anchor:
            return
        content = content.strip()
        # Whitespace-only text nodes would otherwise create an empty key.
        if content:
            self.headlines[content] = self.last_href

    def handle_endtag(self, name):
        """Stop recording when the anchor closes."""
        if name == "a":
            self.while_anchor = False
def sendMessage(de, para=None, cc=None, asunto='Asunto', texto='Cuerpo.',
                archivos=None, host='', charsettexto='iso-8859-1',
                loginUsername='', loginPassword=''):
    """Build a MIME e-mail and, when ``host`` is given, send it over SMTP.

    Args:
        de: sender address, used for the ``From`` header and the envelope.
        para: recipient address or list of addresses (``To`` header).
        cc: carbon-copy address or list of addresses (``Cc`` header).
        asunto: subject line.
        texto: body text, encoded with ``charsettexto``.
        archivos: path or list of paths of files to attach.
        host: SMTP server; when empty, nothing is sent.
        charsettexto: character set declared for the body text.
        loginUsername, loginPassword: SMTP credentials; a login is attempted
            only when both are non-empty.

    Returns:
        The serialized message when ``host`` is empty; otherwise the
        dictionary of refused recipients returned by ``SMTP.sendmail``.

    The caller's ``para``/``cc`` lists are never modified.
    """
    import mimetypes
    import os
    from email import encoders
    from email.mime.base import MIMEBase
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from smtplib import SMTP

    try:
        string_types = (str, unicode)
    except NameError:  # no separate ``unicode`` type on this interpreter
        string_types = (str,)

    def as_list(value):
        """Return ``value`` as a fresh list (a single string becomes one item)."""
        if not value:
            return []
        if isinstance(value, string_types):
            return [value]
        return list(value)

    para = as_list(para)
    cc = as_list(cc)
    archivos = as_list(archivos)

    msg = MIMEText(texto, _charset=charsettexto)
    if archivos:
        contenedor = MIMEMultipart('mixed')  # container for body + attachments
        # The trailing newline is very important: without it the first
        # attachment is not recognised.
        contenedor.preamble = "Este es un mensaje MIME con varias partes.\n"
        contenedor.epilogue = ''
        contenedor.attach(msg)
        msg = contenedor

    # Headers first
    msg['From'] = de
    msg['To'] = ", ".join(para)
    if cc:
        msg['Cc'] = ", ".join(cc)
    msg['Subject'] = asunto

    for archivo in archivos:
        ctype, encoding = mimetypes.guess_type(archivo)
        if ctype is None or encoding is not None:
            # Unknown or compressed content: send it as opaque binary data.
            ctype = 'application/octet-stream'
        maintype, subtype = ctype.split('/', 1)
        if maintype == 'text':
            with open(archivo) as fichero:
                msgparte = MIMEText(fichero.read(), _subtype=subtype)
        else:
            msgparte = MIMEBase(maintype, subtype)
            with open(archivo, "rb") as fichero:
                msgparte.set_payload(fichero.read())
            # Encode the payload using Base64
            encoders.encode_base64(msgparte)
        msgparte.add_header('Content-Disposition', 'attachment',
                            filename=os.path.basename(archivo))
        msg.attach(msgparte)

    if not host:
        return msg.as_string(0)

    s = SMTP(host)
    try:
        if loginUsername and loginPassword:
            s.login(loginUsername, loginPassword)
        # Envelope recipients are To + Cc.
        x = s.sendmail(de, para + cc, msg.as_string(0))
        s.quit()
        return x
    finally:
        # Harmless after a successful quit(); releases the socket on errors.
        s.close()
def conectarBD():
    """Open a connection to the configured MySQL database.

    Reads the module-level settings ``myhost``, ``myuser``, ``mypassword``
    and ``mydb``.

    Returns:
        The ``MySQLdb`` connection, or ``-1`` when the connection fails
        (callers test for this sentinel).
    """
    try:
        return MySQLdb.Connect(host=myhost, user=myuser,
                               passwd=mypassword, db=mydb)
    except MySQLdb.Error:
        # Only database errors are reported here; interrupts and programming
        # errors propagate instead of being hidden.
        print(u"Error en la conexion a la Base de Datos")
        return -1
def insertarHeadLines(datos):
    """Insert new headlines into MySQL and e-mail a notice for each one.

    Args:
        datos: iterable of ``(headline, url)`` pairs.
            NOTE(review): both are assumed to be UTF-8 byte strings, as the
            original decode calls imply - confirm against the scraped page.

    Returns:
        ``-1`` when the database cannot be reached or a database error aborts
        the run; ``None`` otherwise.
    """
    conn = conectarBD()
    if conn == -1:
        # Check the sentinel before touching the connection.
        return -1
    try:
        c = conn.cursor()
        for headline in datos:
            titulo = headline[0].decode('utf8', 'replace')
            enlace = headline[1]
            try:
                # Parameterized query: the driver quotes the scraped values.
                # NOTE(review): the table name is the literal "mytb", as in
                # the original statement; the ``mytb`` setting is not used.
                c.execute(
                    "INSERT INTO mytb (headline, url) VALUES (%s, %s)",
                    (titulo.encode('latin1', 'replace'), enlace))
            except MySQLdb.IntegrityError:
                # Presumably a duplicate key, i.e. a link that is already
                # stored; nothing new to announce.
                continue
            # MySQLdb does not autocommit; persist the row before announcing.
            conn.commit()
            if enviar_correo != 1:
                continue
            # Build the notice as unicode and encode it exactly once.
            referencia_noticia = (
                u'Acaba de ser a\xf1adida la siguiente noticia:\n'
                + titulo
                + u'\nPueden leer la noticia completa en el siguiente enlace:\n'
                + enlace.decode('utf8', 'replace'))
            try:
                sendMessage(de=remitente, para=destinatario, asunto=subject,
                            texto=referencia_noticia.encode('latin1', 'replace'),
                            host="localhost")
            except (smtplib.SMTPException, EnvironmentError):
                # A mail failure must not stop the remaining inserts.
                print(u'Problemas al enviar el correo de aviso')
    except MySQLdb.Error:
        print(u'Problemas al realizar los INSERT')
        return -1
    finally:
        conn.close()
def main():
    """Run the whole job: download the page, extract its links, store them."""
    pagina = urllib2.urlopen(headlines_url).read()
    extractor = HeadlinesParser()
    extractor.feed(pagina)
    pares = extractor.headlines.items()
    try:
        insertarHeadLines(pares)
    except:
        print(u'Problemas al insertar los enlaces.')


if __name__ == "__main__":
    main()
Second script: Parse Documentation Page
[python]
#!/usr/bin/env python
# -*- coding: latin1 -*-
# Copyright (C) 2005 by Alexandro Colorado && Luis Cabrera
# Author:
#   Alexandro Colorado <jza@openoffice.org>
#   Luis Cabrera <lcabrera@sauco.org>
# Copyright: See COPYING file that comes with this distribution
#
# The usage text was empty in the published listing; an explicit empty string
# keeps the module parseable.
__shell_usage__ = ""
__version__ = '20051030'
__author__ = 'Luis Cabrera Sauco'
import HTMLParser
import urllib2
# Start page downloaded by main(); its anchors are scanned for hrefs that
# contain HeadlinesParser.mysearch.
headlines_url = 'http://es.openoffice.org/servlets/ProjectDocumentList?folderID=276&expandFolder=276&folderID=0'
class HeadlinesParser(HTMLParser.HTMLParser):
    """Collect the text and target of every matching link in an HTML page.

    After ``feed()``, ``headlines`` maps the stripped text found inside each
    ``<a>`` element whose ``href`` contains ``mysearch`` to that ``href``.
    ``mysearch`` may be reassigned on an instance before feeding.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.headlines = {}        # link text -> href
        self.while_anchor = False  # True while inside a matching <a> element
        self.last_href = None      # href of the matching <a> being read
        self.mysearch = 'ProjectDocumentList'  # substring an href must contain

    def handle_starttag(self, name, attrs):
        """Start recording when a matching anchor opens."""
        if name != "a":
            return
        # A valueless ``href`` attribute is reported as None; treat it as an
        # empty string so the substring test below cannot raise TypeError.
        current_href = dict(attrs).get("href") or ""
        if self.mysearch in current_href:
            self.while_anchor = True
            self.last_href = current_href

    def handle_data(self, content):
        """Store the text of the current matching anchor."""
        if not self.while_anchor:
            return
        content = content.strip()
        # Whitespace-only text nodes would otherwise create an empty key.
        if content:
            self.headlines[content] = self.last_href

    def handle_endtag(self, name):
        """Stop recording when the anchor closes."""
        if name == "a":
            self.while_anchor = False
def main():
    """Crawl the document index and print one SQL INSERT per linked document.

    Downloads ``headlines_url``, follows every link that matches the parser's
    default search string, and on each of those pages prints an
    ``insert into "Headlines"`` statement for every link whose href contains
    ``'documents'``.
    """
    from urlparse import urljoin

    source = urllib2.urlopen(headlines_url).read()
    parser = HeadlinesParser()
    parser.feed(source)
    for headline in parser.headlines.items():
        urldoc = headline[1].decode('utf8').encode('latin1')
        # Resolve relative hrefs against the index page; absolute URLs are
        # returned unchanged.
        urldoc = urljoin(headlines_url, urldoc)
        docsource = urllib2.urlopen(urldoc).read()
        docparser = HeadlinesParser()
        docparser.mysearch = 'documents'
        docparser.feed(docsource)
        for datos in docparser.headlines.items():
            # NOTE(review): headline[0] is printed as scraped while the other
            # two fields are re-encoded to latin1, as in the original.
            print("""insert into "Headlines" values ('%s','%s','%s');""" % (
                headline[0],
                datos[0].decode('utf8').encode('latin1'),
                datos[1].decode('utf8').encode('latin1')))


if __name__ == "__main__":
    main()
-- AlexandroColorado - 23 Oct 2005
-- LuisCabreraSauco - 02 Nov 2005