I'm trying to read webpages using pyqt. I need to call a method multiple times with different URLs. I am currently using code similar to: http://blog.sitescraper.net/2010/06/scraping-javascript-webpages-in-python.html#comment-form
However when I try I get seg faults. Any suggestions welcome.
import sys
from time import clock
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import *
class Render(QWebPage):
def __init__(self):
self.app = QApplication(sys.argv)
QWebPage.__init__开发者_Python百科(self)
self.networkAccessManager().finished.connect(self.handleEnd)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
def loadURL(self, url):
self.mainFrame().load(QUrl(url))
self.app.exec_()
def savePageImage (self, width, height, Imagefile):
pageSize = self.mainFrame().contentsSize();
if width == 0:
pageWidth = pageSize.width()
else:
pageWidth = width
if height == 0:
pageHeight = pageSize.height()
else:
pageHeight = height
self.setViewportSize(QSize(pageWidth, pageHeight))
Img = QImage(self.viewportSize(), QImage.Format_ARGB32)
painter = QPainter(Img)
self.mainFrame().render(painter)
painter.end()
Img.save(Imagefile)
def _loadFinished(self, result):
print "load finish"
self.frame = self.mainFrame()
self.returnVal = result
self.app.quit()
def handleEnd (self, reply):
# get first http code and disconnect
# could add filter to listen relevant responses
self.httpcode = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)
self.networkAccessManager().finished.disconnect(self.handleEnd)
jsrurl = 'http://www.w3resource.com/javascript/document-alert-confirm/four.html'
badurl='something.or.other'
badhttp = 'http://eclecticself.com/test2.html'
testurl = 'http://www.nydailynews.com/entertainment/index.html'
testurl2 = 'http://www.palmbeachpost.com/'
testurl3 = 'http://www.nydailynews.com/news/politics/2011/08/03/2011-08-03_pat_buchanan_downplays_controversy_after_calling_president_obama_your_boy_to_rev.html'
url = testurl
start = clock()
r = Render()
r.loadURL(url)
html = r.frame.toHtml()
elapsed = clock() - start
print elapsed
if (r.returnVal == True):
if (r.httpcode.toInt()[0] != 404):
#print html.toUtf8()
start = clock()
r.savePageImage(1024, 0, "pageSnapshot.png")
elapsed = clock() - start
print elapsed
else:
print 'page not found'
else:
print 'badurl'
s = Render()
s.loadURL(jsrurl)
html = s.frame.toHtml()
elapsed = clock() - start
print elapsed
if (s.returnVal == True):
if (s.httpcode.toInt()[0] != 404):
print html.toUtf8()
start = clock()
s.savePageImage(1024, 0, "pageSnapshot.png")
elapsed = clock() - start
print elapsed
else:
print 'page not found'
else:
print 'badurl'
PyQt is often forgetting to keep references to objects. Workarounds:
Try to use PySide instead of PyQt, it is easy, since the API is almost completely the same as PyQt. I would try PySide first, it might solve your problem immediately or at least make it predictable and reproducible.
Try to keep references to all the Qt objects you are using and remove those references when you're done with the objects. You can also try to explicitly close them or navigate to "about:blank" before going to the next Web page.
It usually helps. If not, then you need to narrow it down as utdemir suggested it above. Debugging usually not help, since such issues are often timing related as well. Logging without an output buffer usually helps you get closer to the source of the problem.
I'm with you in soul, such issues are hard to track down!
精彩评论