python: get google adsense earnings report_问答_开发者

I need a python script that gets the google adsense earnings and I found adsense scraper: http://pypi.python.org/pypi/adsense_scraper/0.5 It uses Twill and html5lib to scrape google adsense earnings data. When I use it I get this error message:

Traceback (most recent call last):
  File "adsense_scraper.py", line 163, in <module>
    data = main()
  File "adsense_scraper.py", line 154, in main
    b = get_adsense(login, password)
  File "adsense_scraper.py", line 128, in get_adsense
    b.submit()
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
    self._journey('open', request)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
    r = func(*args, **kwargs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twi开发者_运维知识库ll-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
   "http", request, response, code, msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
    "refresh", msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
    response = urlopen(self, req, data)
  File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
  File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>

So the important thing is:

urllib2.URLError: <urlopen error unknown url type: 'http>

Can somebody tell me where the error is? Is there even a better way to get the data via python? Thanks

there are several errors with the package, you mentioned only the first one

1) twill package does not handle google's redirects correctly, adding

    newurl = newurl.strip( "'" )

to twill/other_packages/_mechanize_dist/_http.py:108 before

    newurl = _rfc3986.clean_url(newurl, "latin-1")

fixes that

2) you have to have the correct language set in adsense - English

3) there are several problems in the orignal adsense_scraper

#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master


Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Return an int or decimal.Decimal given a human-readable number

    """
    light_stripped = s.strip(u'\u20ac')
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """
    Parse the etree doc for summarytable, returns::

        [{'channel': unicode,
          'impressions': int,
          'clicks': int,
          'ctr': decimal.Decimal,
          'ecpm': decimal.Decimal,
          'earnings': decimal.Decimal}]

    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            # adsense inserts an empty span if a row has a period in it, so
            # get the children and find the tail element to append to the text
            if c.find('a') and c.find('a').getchildren():
                tail = c.find('a').getchildren()[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))

        celltext = filter( lambda x: x != "" , celltext )
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = map(parse_decimal, celltext)
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))

    return res


def get_adsense(login, password):
    """Returns a twill browser instance after having logged in to AdSense
    with *login* and *password*.

    The returned browser will have all of the appropriate cookies set but may
    not be at the exact page that you want data from.

    """
    b = twill.commands.get_browser()
    b.go(SERVICE_LOGIN_BOX_URL)
    for form in b.get_all_forms():
        try:
            form['Email'] = login
            form['Passwd'] = password
        except ValueError:
            continue
        else:
            break
    else:
        raise ValueError("Could not find login form on page")
    b._browser.select_form(predicate=lambda f: f is form)
    b.submit()
    return b


def get_time_period(b, period):
    """Returns the parsed summarytable for the time period *period* given
    *b* which should be the result of a get_adsense call. *period* must be
    a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.

    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough
    # to use directly, so we parse and then dump into cElementTree.
    doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
    return parse_summary_table(doc)


def main():
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = {}
    for period in TIME_PERIODS:
        data[period] = get_time_period(b, period)
    pprint.pprint(data)
    twill.set_output(None)
    return data

if __name__ == '__main__':
    data = main()