I have the following program, that is trying to upload a file (or files) to an image upload site, however I am struggling to find out how to parse the returned HTML to grab the direct link (contained in a <dd class="download"><input type="text" value="{hereisthelink}"></dd>
).
I have the code below:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pycurl
import urllib
import urlparse
import xml.dom.minidom
import StringIO
import sys
import gtk
import os
import imghdr
import locale
import gettext
try:
import pynotify
except:
print "Install pynotify. It's whoasome!"
APP="Uploadir Uploader"
DIR="locale"
locale.setlocale(locale.LC_ALL, '')
gettext.bindtextdomain(APP, DIR)
gettext.textdomain(APP)
_ = gettext.gettext
##STRINGS
uploading = _("Uploading image to Uploadir.")
oneimage = _("1 image has been successfully uploaded.")
multimages = _("images have been successfully uploaded.")
uploadfailed = _("Unable to upload to Uploadir.")
class Uploadir:
def __init__(self, args):
self.images = []
self.urls = []
self.broadcasts = []
self.username=""
self.password=""
if len(args) == 1:
return
else:
for file in args:
if file == args[0] or file == "":
continue
if file.startswith("-u"):
self.username = file.split("-u")[1]
#print self.username
continue
if file.startswith("-p"):
self.password = file.split("-p")[1]
#print self.password
continue
self.type = imghdr.what(file)
self.images.append(file)
for file in self.images:
self.upload(file)
self.setClipBoard()
self.broadcast(self.broadcasts)
def broadcast(self, l):
try:
str = '\n'.join(l)
n = pynotify.Notification(str)
n.set_urgency(pynotify.URGENCY_LOW)
n.show()
except:
for line in l:
print line
d开发者_如何学JAVAef upload(self, file):
#Try to login
cookie_file_name = "/tmp/uploadircookie"
if ( self.username!="" and self.password!=""):
print "Uploadir authentication in progress"
l=pycurl.Curl()
loginData = [ ("username",self.username),("password", self.password), ("login", "Login") ]
l.setopt(l.URL, "http://uploadir.com/user/login")
l.setopt(l.HTTPPOST, loginData)
l.setopt(l.USERAGENT,"User-Agent: Uploadir (Python Image Uploader)")
l.setopt(l.FOLLOWLOCATION,1)
l.setopt(l.COOKIEFILE,cookie_file_name)
l.setopt(l.COOKIEJAR,cookie_file_name)
l.setopt(l.HEADER,1)
loginDataReturnedBuffer = StringIO.StringIO()
l.setopt( l.WRITEFUNCTION, loginDataReturnedBuffer.write )
if l.perform():
self.broadcasts.append("Login failed. Please check connection.")
l.close()
return
loginDataReturned = loginDataReturnedBuffer.getvalue()
l.close()
#print loginDataReturned
if loginDataReturned.find("<li>Your supplied username or password is invalid.</li>")!=-1:
self.broadcasts.append("Uploadir authentication failed. Username/password invalid.")
return
else:
self.broadcasts.append("Uploadir authentication successful.")
#cookie = loginDataReturned.split("Set-Cookie: ")[1]
#cookie = cookie.split(";",0)
#print cookie
c = pycurl.Curl()
values = [
("file", (c.FORM_FILE, file)),
("terms", "1"),
("submit", "submit")
]
buf = StringIO.StringIO()
c.setopt(c.URL, "http://uploadir.com/file/upload")
c.setopt(c.HTTPPOST, values)
c.setopt(c.COOKIEFILE, cookie_file_name)
c.setopt(c.COOKIEJAR, cookie_file_name)
c.setopt(c.WRITEFUNCTION, buf.write)
if c.perform():
self.broadcasts.append(uploadfailed+" "+file+".")
c.close()
return
self.result = buf.getvalue()
#print self.result
c.close()
doc = urlparse.urlparse(self.result)
print doc
self.urls.append(doc.getElementsByTagName("download")[0].childNodes[0].nodeValue)
def setClipBoard(self):
c = gtk.Clipboard()
c.set_text('\n'.join(self.urls))
c.store()
if len(self.urls) == 1:
self.broadcasts.append(oneimage)
elif len(self.urls) != 0:
self.broadcasts.append(str(len(self.urls))+" "+multimages)
if __name__ == '__main__':
uploadir = Uploadir(sys.argv)
The code that deals with the HTML parsing is here:
doc = urlparse.urlparse(self.result)
self.urls.append(doc.getElementsByTagName("download")[0].childNodes[0].nodeValue)
The urlparse
module has nothing to do with parsing HTML. All it does is break a URL up into bits: protocol, network address, path, etc. For example:
>>> urlparse.urlparse("http://www.stackoverflow.com/questions/4699888") ParseResult(scheme='http', netloc='www.stackoverflow.com', path='/questions/4699888', params='', query='', fragment='')
For parsing HTML, try BeautifulSoup.
精彩评论