pyqt4 seg fault sequential app start stop_问答_开发者

I'm trying to read webpages using pyqt. I need to call a method multiple times with different URLs. I am currently using code similar to: http://blog.sitescraper.net/2010/06/scraping-javascript-webpages-in-python.html#comment-form

However when I try I get seg faults. Any suggestions welcome.

import sys

from time import clock
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import *

class Render(QWebPage):
  def __init__(self):
    self.app = QApplication(sys.argv)
    QWebPage.__init__开发者_Python百科(self)

    self.networkAccessManager().finished.connect(self.handleEnd)
    self.loadFinished.connect(self._loadFinished)

    self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
    self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

  def loadURL(self, url):
    self.mainFrame().load(QUrl(url))
    self.app.exec_()

  def savePageImage (self, width, height, Imagefile):
    pageSize = self.mainFrame().contentsSize();
    if width == 0:
        pageWidth = pageSize.width()
    else:
        pageWidth = width
    if height == 0:
        pageHeight = pageSize.height()
    else:
        pageHeight = height

    self.setViewportSize(QSize(pageWidth, pageHeight))
    Img = QImage(self.viewportSize(), QImage.Format_ARGB32)
    painter = QPainter(Img)
    self.mainFrame().render(painter)
    painter.end()
    Img.save(Imagefile)


  def _loadFinished(self, result):
    print "load finish"
    self.frame = self.mainFrame()
    self.returnVal = result 
    self.app.quit()

  def handleEnd (self, reply):
    # get first http code and disconnect
    # could add filter to listen relevant responses
    self.httpcode = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)
    self.networkAccessManager().finished.disconnect(self.handleEnd)


jsrurl = 'http://www.w3resource.com/javascript/document-alert-confirm/four.html'
badurl='something.or.other'
badhttp = 'http://eclecticself.com/test2.html'
testurl = 'http://www.nydailynews.com/entertainment/index.html'
testurl2 = 'http://www.palmbeachpost.com/'
testurl3 = 'http://www.nydailynews.com/news/politics/2011/08/03/2011-08-03_pat_buchanan_downplays_controversy_after_calling_president_obama_your_boy_to_rev.html'
url = testurl



start = clock()
r = Render()
r.loadURL(url)
html = r.frame.toHtml()
elapsed = clock() - start
print elapsed

if (r.returnVal == True):
    if (r.httpcode.toInt()[0] != 404):
        #print html.toUtf8()
        start = clock()
        r.savePageImage(1024, 0, "pageSnapshot.png")
        elapsed = clock() - start
        print elapsed
    else:
        print 'page not found'
else:
    print 'badurl'

s = Render()
s.loadURL(jsrurl)
html = s.frame.toHtml()
elapsed = clock() - start
print elapsed
if (s.returnVal == True):
    if (s.httpcode.toInt()[0] != 404):
        print html.toUtf8()
        start = clock()
        s.savePageImage(1024, 0, "pageSnapshot.png")
        elapsed = clock() - start
        print elapsed
    else:
        print 'page not found'
else:
    print 'badurl'

PyQt is often forgetting to keep references to objects. Workarounds:

Try to use PySide instead of PyQt, it is easy, since the API is almost completely the same as PyQt. I would try PySide first, it might solve your problem immediately or at least make it predictable and reproducible.
Try to keep references to all the Qt objects you are using and remove those references when you're done with the objects. You can also try to explicitly close them or navigate to "about:blank" before going to the next Web page.

It usually helps. If not, then you need to narrow it down as utdemir suggested it above. Debugging usually not help, since such issues are often timing related as well. Logging without an output buffer usually helps you get closer to the source of the problem.

I'm with you in soul, such issues are hard to track down!