AppStore::Scraper アップデート

AppStore::Scraper アップデート #

この前の iTunes のアップデートのタイミングで、また AppStore::Scraper が動かなくなった
スクレイピングはこれがあるからなぁ。。。。仕方ないので adhoc に書きなおした
今までと同じように使えるけど、レート毎の件数(☆ 1 つが何件とか)がとれなくなってます
そのかわりってわけじゃないが、

  • 件数とアベレージは取得できる
  • iPhone でも iPad でも動くアプリは、両方のランキング取得できる

今回から Search API 使ってます。今後はスクレイピングは最小限にして、Search API でとれるものをベースに書き換える予定。。。だが、動いてるうちは書き換えんやろなぁ

前と変わらないけど、使い方

# Usage example: fetch base info, rankings and reviews for one app in one store.
use AppStore::Scraper;
use Data::Dumper;

# "wait" is the number of seconds the scraper sleeps after each app/store pair.
my $obj = AppStore::Scraper->new(wait => 5);
# app           : numeric app id(s)
# store         : store key(s) from the module's country table ('jp' = Japan)
# lang          : numeric language id sent in the X-Apple-Store-Front header
#                 (NOTE(review): meaning of the value 9 is not shown here - confirm)
# review_number : maximum number of reviews to return
# review_order  : 1 Most Helpful, 2 Most Favourable, 3 Most Critical, 4 Most Recent
my $info = $obj->app_info(
                          app => ['404732112'],
                          store => ['jp'],
                          lang => 9,
                          review_number => 100,
                          review_order => 4
                         );

# Dump the nested result ({ app id => { store => { ... } } }) to STDERR.
warn Dumper $info;

とかやると

$VAR1 = {
          '404732112' => {
                           'jp' => {
                                     'review_number' => 100,
                                     'total_rank' => 38,
                                     'store_code' => 143462,
                                     'app_name' => 'Sleipnir Mobile - Web Browser',
                                     'genre_rank' => 4,
                                     'store_name' => 'Japan',
                                     'artist_id' => 318578225,
                                     'genre_name' => 'Utilities',
                                     'ranks' => {
                                                'genre_rank_ipad' => 1,
                                                'total_rank_ipad' => 3,
                                                'total_rank_iphone' => 38,
                                                'genre_rank_iphone' => 4
                                              },
                                     'lang' => 9,
                                     'reviews' => [
						   {
                                                    'date' => '22-Apr-2011',
                                                    'message' => "....."
						   },
 .
 .
 .
                                                ],
                                     'ident' => 'both',
                                     'genre_id' => 6002,
                                     'review_order' => 4,
                                     'ratings' => {
                                                  'userRatingCount' => 213,
                                                  'averageUserRatingForCurrentVersion' => '3.5',
                                                  'averageUserRating' => '3.5',
                                                  'userRatingCountForCurrentVersion' => 34
                                                },
                                     'price' => '0'
                                   }
                         }
        };

こんな感じ

ソースコードは長いので、続きからどうぞ

package AppStore::Scraper;

use strict;
use utf8;
use warnings;
use Data::Dumper;

use LWP::UserAgent;
use XML::Simple;
use JSON;

# Constructor.
#
# Accepts either a hash reference or a flat key/value list. The only option
# read here is "wait": the number of seconds to sleep after each app/store
# pair in app_info (a false value, including 0, falls back to '1').
#
# Returns a blessed hash holding the store table, the URL prefixes, the
# configured LWP::UserAgent and the preferred XML parser name.
sub new {
    my ( $class, @params ) = @_;
    my $opt = ref $params[0] eq 'HASH' ? $params[0] : {@params};

    my $base_url = 'http://ax.itunes.apple.com/WebObjects/';

    # NOTE(review): the opening parenthesis in this User-Agent string is never
    # closed. It is kept byte-for-byte because the remote side has only been
    # exercised with this exact value - confirm before "fixing" it.
    my $agent_name = 'iTunes/9.1.1 (Macintosh; Intel Mac OS X 10.6.3';

    my $store_table = _init_countries();

    my $ua = LWP::UserAgent->new();
    $ua->timeout(30);
    $ua->env_proxy;
    $ua->agent($agent_name);

    my %fields = (
        __STORE_CODES        => $store_table,
        __URL_PREF           => $base_url,
        __SCRAPING_URL_PREF  => $base_url . 'MZStore.woa/wa/',
        __SEARCHAPI_URL_PREF => $base_url . 'MZStoreServices.woa/wa/',
        __UA                 => $agent_name,
        ua                   => $ua,
        __WAIT               => $opt->{wait} || '1',

        # Parser handed to XML::Simple. Alternatives that were tried:
        # 'XML::SAX::PurePerl', 'XML::SAX::Expat', 'XML::LibXML::SAX'.
        __XML_PREFERRED_PARSER => 'XML::Parser',
    );

    my $package = ref $class || $class;
    return bless \%fields, $package;
}

# Collect everything known about each requested app in each requested store.
#
# Arguments are passed straight to app_base_info (see _validate_args for the
# accepted keys). For every app/store pair that has a genre id this adds:
#   genre_rank / total_rank : the iPhone rankings (kept for older callers)
#   ranks                   : iPhone and iPad genre/total rankings
#   reviews                 : array ref from app_reviews
#   store_name              : display name from the store table
# and then sleeps for the configured wait time.
#
# Returns { app id => { store key => { ... } } }.
sub app_info {
    my ( $self, @params ) = @_;

    my $base = $self->app_base_info(@params);

    my %result;
    for my $app ( keys %$base ) {
        my $per_store = $base->{$app};
        for my $store ( keys %$per_store ) {
            my $info = $per_store->{$store};

            # Without a genre id the lookup yielded nothing usable.
            next unless $info->{genre_id};

            my %rank = map { $_ => undef }
                qw(genre_rank_iphone total_rank_iphone genre_rank_ipad total_rank_ipad);

            for my $device (qw(iphone ipad)) {

                # An app restricted to the other device has no ranking here.
                my $other = $device eq 'iphone' ? 'ipad' : 'iphone';
                next if $info->{ident} eq $other;

                # The rank lookups choose the chart from info->{ident}, so
                # hand them a copy that is pinned to this device.
                my $device_info = { %$info, ident => $device };
                $rank{"genre_rank_$device"} = $self->genre_rank(
                    app  => $app,
                    info => $device_info,
                );
                $rank{"total_rank_$device"} = $self->total_rank(
                    app  => $app,
                    info => $device_info,
                );
            }

            my $reviews = $self->app_reviews(
                app  => $app,
                info => $info,
            );

            $result{$app}{$store} = {
                %$info,
                genre_rank => $rank{genre_rank_iphone},
                total_rank => $rank{total_rank_iphone},
                ranks      => {
                    genre_rank_iphone => $rank{genre_rank_iphone},
                    total_rank_iphone => $rank{total_rank_iphone},
                    genre_rank_ipad   => $rank{genre_rank_ipad},
                    total_rank_ipad   => $rank{total_rank_ipad},
                },
                reviews    => $reviews,
                store_name => $self->{__STORE_CODES}->{$store}->{name},
            };

            sleep $self->{__WAIT};
        }
    }

    return \%result;
}

# Look up the basic facts about each requested app in each requested store
# through the Search API (wsLookup).
#
# Arguments: same as _validate_args (hash ref or key/value list).
#
# Returns { app id => { store key => { ... } } } where each leaf holds
#   store_code, lang, ident, review_number, review_order  (from the arguments)
#   genre_id, artist_id, app_name, genre_name, price      (from the API)
#   ratings => { averageUserRating, userRatingCount,
#                averageUserRatingForCurrentVersion,
#                userRatingCountForCurrentVersion }
# A pair whose HTTP request fails is reported with warn and left out. A pair
# whose response cannot be decoded is reported with warn and stored with
# undefined API fields.
sub app_base_info {
    my $self = shift;
    my @args = @_;

    my $args = $self->_validate_args(@args);

    # Store keys of this module that are not the ISO country code the Search
    # API expects in its "country" parameter ('nu' is this module's key for
    # New Zealand, whose ISO code is 'nz').
    my %api_country_for = (
        uk => 'gb',
        nu => 'nz',
    );

    my $ret = {};
    for my $app ( @{ $args->{apps} } ) {
        for my $store ( keys %{ $args->{stores} } ) {
            my $tmp = {
                store_code    => $args->{stores}->{$store}->{code},
                lang          => $args->{lang},
                ident         => $args->{ident},
                review_number => $args->{review_number},
                review_order  => $args->{review_order},
            };

            my $country = exists $api_country_for{$store} ? $api_country_for{$store} : $store;
            my $uri = $self->{__SEARCHAPI_URL_PREF} . 'wsLookup?id=' . $app . '&entity=software&country=' . $country;
            my $res = $self->{ua}->get($uri);

            # Error Check
            unless ( $res->is_success ) {
                warn 'request failed: ', $uri, ': ', $res->status_line, ': ', $store, '-', $args->{lang};
                next;
            }

            # Decode failures used to be swallowed silently; report them and
            # carry on with an empty result so the entry keeps its old shape.
            my $hash;
            unless ( eval { $hash = JSON->new->utf8->decode( $res->content ); 1 } ) {
                warn 'json decode failed: ', $uri, ': ', $@;
            }
            my $first;
            if ( ref $hash eq 'HASH' and ref $hash->{results} eq 'ARRAY' ) {
                $first = $hash->{results}->[0];
            }
            my $app_data = ref $first eq 'HASH' ? $first : {};

            #
            # genre_id, artist_id, app_name, genre_name
            #

            $tmp->{genre_id}  = $app_data->{primaryGenreId};
            $tmp->{artist_id} = $app_data->{artistId};
            $tmp->{app_name}  = $app_data->{trackName};
            if ( $tmp->{app_name} ) {

                # Trim each end on its own: a single combined pattern only
                # matches when BOTH ends carry whitespace.
                $tmp->{app_name} =~ s/^\s+//;
                $tmp->{app_name} =~ s/\s+$//;
            }
            $tmp->{genre_name} = $app_data->{primaryGenreName};

            #
            # price
            #

            $tmp->{price} = $app_data->{price};

            #
            # star
            #

            $tmp->{ratings} = {
                averageUserRating                  => $app_data->{averageUserRating},
                userRatingCount                    => $app_data->{userRatingCount},
                averageUserRatingForCurrentVersion => $app_data->{averageUserRatingForCurrentVersion},
                userRatingCountForCurrentVersion   => $app_data->{userRatingCountForCurrentVersion},
            };

            $ret->{$app}->{$store} = $tmp;
        }
    }

    return $ret;
}


#
# for rank
#

# Rank of an app within its genre chart.
#
# Takes the same arguments as _get_rank and returns its result. This must stay
# a real sub call (not a tail call): _get_rank looks at the name of the sub
# that called it and adds the genre filter only when that name ends in
# "genre_rank".
sub genre_rank {
    my ( $self, @params ) = @_;

    return $self->_get_rank(@params);
}

# Rank of an app in the overall (all genres) chart.
#
# Takes the same arguments as _get_rank and returns its result. Kept as a
# separate named sub because _get_rank decides whether to add a genre filter
# from the name of its caller; this name does not trigger it.
sub total_rank {
    my ( $self, @params ) = @_;

    return $self->_get_rank(@params);
}

# Build the URL of the top chart to scrape.
#
#   $price : true selects the 30/47 list, false the 27/44 list
#   $ident : 'ipad' selects the iPad list, anything else the iPhone list
#
# Returns the chart URL as a string.
sub _rank_uri {
    my ( $self, $price, $ident ) = @_;

    # popId per device and price class: iphone 30:27, ipad 47:44
    my %pop_id_for = (
        iphone => { priced => 30, unpriced => 27 },
        ipad   => { priced => 47, unpriced => 44 },
    );

    my $device = $ident eq 'ipad' ? 'ipad'   : 'iphone';
    my $class  = $price           ? 'priced' : 'unpriced';

    return $self->{__SCRAPING_URL_PREF} . 'viewTop?id=25209&popId=' . $pop_id_for{$device}{$class};
}

# Find an app's position in a top chart.
#
# Arguments (hash ref or key/value list):
#   app   : numeric app id to look for
#   info  : base info hash for the app; when absent it is fetched through
#           app_base_info and picked out by the "app" and "store" arguments
#
# The chart is chosen from info->{price} and info->{ident}. When the sub that
# called this one has a name ending in "genre_rank", the chart is restricted
# to info->{genre_id}.
#
# Returns the 1-based position, or undef when the app is not in the chart.
sub _get_rank {
    my ( $self, @params ) = @_;
    my $opt = ref $params[0] eq 'HASH' ? $params[0] : {@params};

    # Must be read here, in this sub's own frame: frame 1 is our caller.
    my $invoked_by = ( caller(1) )[3];

    my $info = $opt->{info};
    unless ($info) {
        my $base = $self->app_base_info($opt);
        $info = $base->{ $opt->{app} }->{ $opt->{store} };
    }

    my $uri = $self->_rank_uri( $info->{price}, $info->{ident} );
    if ( $invoked_by =~ /genre_rank$/ ) {
        $uri .= '&genreId=' . $info->{genre_id};
    }

    my $xml = $self->_get_xml( $uri, $info->{store_code}, $info->{lang} );

    # The chart entries are located by dumping the parsed tree as text and
    # counting, line by line, every occurrence of an app id.
    my $dumped = Dumper( $xml->{View}->{ScrollView}->{VBoxView}->{View} );

    my $rank;
    my $position = 0;
    for my $line ( split /\n+/, $dumped ) {
        next unless $line =~ /salableAdamId=(\d+)/;
        $position++;
        if ( $1 == $opt->{app} ) {
            $rank = $position;
            last;
        }
    }

    return $rank;
}

#
# for reviews
#

# Fetch up to info->{review_number} reviews of an app.
#
# Arguments (hash ref or key/value list):
#   app   : numeric app id
#   info  : base info hash for the app; when absent it is fetched through
#           app_base_info and picked out by the "app" and "store" arguments
#
# The review URL is info->{review_url} when present, otherwise the standard
# user-reviews page for the app. Its sortOrdering is forced to
# info->{review_order}. When the URL carries a pageNumber parameter, pages
# are fetched from 0 upwards until enough reviews are collected or a page
# comes back empty.
#
# Returns an array ref of { message => ..., date => ... } hashes.
sub app_reviews {
    my $self = shift;
    my @args = @_;

    my $args_ref = ref $args[0] eq 'HASH' ? $args[0] : {@args};

    my $info;
    if ( $args_ref->{info} ) {
        $info = $args_ref->{info};
    }
    else {
        my $base_info = $self->app_base_info($args_ref);
        $info = $base_info->{ $args_ref->{app} }->{ $args_ref->{store} };
    }

    my $order = $info->{review_order};
    my $limit = $info->{review_number};
    my $uri   = $info->{review_url}
        || $self->{__SCRAPING_URL_PREF}
        . 'viewContentsUserReviews?pageNumber=0&type=Purple+Software&id='
        . $args_ref->{app}
        . '&sortOrdering='
        . $order;

    $uri =~ s|sortOrdering=\d+|sortOrdering=$order|;

    my $ret = [];

    # pagination
    if ( $uri =~ /(?:\?|&)pageNumber=\d+/ ) {
        my $page = 0;

        # Strictly "<": once exactly $limit reviews are in hand another
        # request would only fetch data that is thrown away below.
        while ( scalar(@$ret) < $limit ) {
            $uri =~ s|pageNumber=\d+|pageNumber=$page|;
            my $tmp = $self->_app_reviews( $uri, $info->{store_code}, $info->{lang} );
            last unless scalar(@$tmp);
            push @$ret, @$tmp;
            $page++;
        }
    }
    else {
        $ret = $self->_app_reviews( $uri, $info->{store_code}, $info->{lang} );
    }

    # The last page may overshoot the requested number.
    @$ret = splice @$ret, 0, $limit;

    return $ret;
}

# Fetch one page of reviews and turn it into a list of review hashes.
#
#   $uri        : review page URL
#   $store_code : numeric store code, passed on to _get_xml
#   $lang       : numeric language id, passed on to _get_xml
#
# Returns an array ref of { message => ..., date => ... }; empty when the
# expected node is missing from the page.
sub _app_reviews {
    my ( $self, $uri, $store_code, $lang ) = @_;

    my $xml = $self->_get_xml( $uri, $store_code, $lang );
    my $node = $xml->{View}->{ScrollView}->{VBoxView}->{View}->{MatrixView}->{VBoxView}->[0]->{VBoxView}->{VBoxView};

    # The node is a single hash for one review and an array for several.
    my $shape = ref $node;
    my @entries
        = $shape eq 'HASH'  ? ($node)
        : $shape eq 'ARRAY' ? @$node
        :                     ();

    my @reviews;
    for my $entry (@entries) {
        my ( $date, $mes ) = $self->_get_review_message($entry);
        push @reviews, { message => $mes, date => $date };
    }

    return \@reviews;
}

# Pull the date and the message text out of one parsed review node.
#
#   $args : hash ref for a single review as produced by the XML parser
#
# The message is TextView/SetFontStyle/content (joined with newlines when it
# is a list). The date is the last whitespace-indented line of the second
# HBoxView's TextView/SetFontStyle/content (its last element when a list).
#
# Returns the list ($date, $message); $date is undef when no date text exists.
sub _get_review_message {
    my ( $self, $args ) = @_;

    my $message = $args->{TextView}->{SetFontStyle}->{content};
    my $byline  = $args->{HBoxView}->[1]->{TextView}->{SetFontStyle}->{content} || '';

    my $date_text = $byline;
    if ( ref $byline eq 'ARRAY' ) {
        $date_text = $byline->[ scalar(@$byline) - 1 ];
    }

    my $date;
    if ($date_text) {
        chomp $date_text;
        my @pieces = split /\n\s+/, $date_text;
        $date = pop @pieces;
    }

    $message = join "\n", @{$message} if ref $message eq 'ARRAY';

    return ( $date, $message );
}

#
# common
#

# Normalise and check the arguments shared by the public methods.
#
# Arguments (hash ref or key/value list):
#   app           : required; one numeric app id or an array ref of them
#   store         : optional; one store key or an array ref (case-insensitive);
#                   defaults to every store in the table
#   lang          : optional digits; default 1
#   ident         : optional 'ipad' or 'iphone'; anything else means 'both'
#   review_number : optional digits; default 25
#   review_order  : optional digits; default 1
#                     1..Most Helpful
#                     2..Most Favourable
#                     3..Most Critical
#                     4..Most Recent
#
# Dies when "app" is missing or not numeric, or when a store key is unknown.
#
# Returns a hash ref with keys apps, stores, lang, ident, review_number and
# review_order.
sub _validate_args {
    my ( $self, @params ) = @_;
    my $opt = ref $params[0] eq 'HASH' ? $params[0] : {@params};

    # target apps
    die 'app code MUST be needed' unless $opt->{app};

    my $wanted_apps = $opt->{app};
    my @app_ids = ref $wanted_apps eq 'ARRAY' ? @$wanted_apps : ($wanted_apps);
    for my $id (@app_ids) {
        die 'app code MUST be numerical: ', $id unless $id =~ m|^\d+$|;
    }

    # target countries
    my $known_stores = $self->{__STORE_CODES};
    my $stores;
    if ( $opt->{store} ) {
        my $wanted_stores = $opt->{store};
        for my $name ( ref $wanted_stores eq 'ARRAY' ? @$wanted_stores : ($wanted_stores) ) {
            my $key = lc $name;
            die 'cannot found appstore on "', $key, '"'
                unless exists $known_stores->{$key};
            $stores->{$key} = $known_stores->{$key};
        }
    }
    else {
        $stores = $known_stores;
    }

    # Options that must consist of digits only, each with its fallback.
    my $digits_or = sub {
        my ( $key, $fallback ) = @_;
        return ( exists $opt->{$key} and $opt->{$key} =~ /^\d+$/ ) ? $opt->{$key} : $fallback;
    };

    # device identifier
    my $ident = 'both';
    if ( exists $opt->{ident} ) {
        for my $device (qw(ipad iphone)) {
            $ident = $device if $opt->{ident} eq $device;
        }
    }

    return {
        apps          => [@app_ids],
        stores        => $stores,
        lang          => $digits_or->( 'lang', 1 ),
        ident         => $ident,
        review_number => $digits_or->( 'review_number', 25 ),
        review_order  => $digits_or->( 'review_order', 1 ),
    };
}

# Fetch a store page and parse it as XML.
#
#   $uri   : page URL
#   $store : numeric store code
#   $lang  : numeric language id
#
# Store and language are sent as "<store>-<lang>" in the X-Apple-Store-Front
# header, which is set as a default header on the shared user agent.
#
# Returns the structure produced by XMLin, or undef (after a warn) when the
# request fails or the response is not served as XML.
sub _get_xml {
    my $self = shift;
    my ( $uri, $store, $lang ) = @_;

    $self->{ua}->default_header( 'X-Apple-Store-Front' => $store . '-' . $lang );
    my $res = $self->{ua}->get($uri);

    # Error Check
    #
    # Failures leave with "return", not "next": this sub has no loop of its
    # own, so "next" would either die or jump into whatever loop a caller
    # happens to be running (skipping a whole store, or repeating the same
    # page request forever).
    unless ( $res->is_success ) {
        warn 'request failed: ', $uri, ': ', $res->status_line, ': ', $store, '-', $lang;
        return;
    }

    # A response may carry no Content-Type at all; treat that as "not xml".
    my $content_type = $res->headers->header('Content-Type');
    $content_type = '' unless defined $content_type;
    unless ( $content_type =~ m|/xml| ) {
        warn 'content is not xml: ', $uri, ': ', $content_type, ': ', $store, '-', $lang;
        return;
    }

    local $XML::Simple::PREFERRED_PARSER = $self->{__XML_PREFERRED_PARSER};
    my $xmlobj = XMLin( $res->content );

    return $xmlobj;
}

# Build the table of supported stores.
#
# Returns a fresh hash ref: { store key => { name => display name,
# code => numeric store code } }. The key is what callers pass as "store",
# and the code is what is sent in the X-Apple-Store-Front header.
#
# NOTE(review): New Zealand is registered under the key 'nu' (its ISO
# country code is 'nz'). The key is left unchanged so existing callers keep
# working.
sub _init_countries {

    # [ store key, store code, display name ]
    my @stores = (
        [ 'jp', 143462, 'Japan' ],
        [ 'us', 143441, 'United States' ],
        [ 'ar', 143505, 'Argentine' ],
        [ 'au', 143460, 'Australia' ],
        [ 'be', 143446, 'Belgium' ],
        [ 'br', 143503, 'Brazil' ],
        [ 'ca', 143455, 'Canada' ],
        [ 'cl', 143483, 'Chile' ],
        [ 'cn', 143465, 'China' ],
        [ 'co', 143501, 'Colombia' ],
        [ 'cr', 143495, 'Costa Rica' ],
        [ 'hr', 143494, 'Croatia' ],
        [ 'cz', 143489, 'Czech Republic' ],
        [ 'dk', 143458, 'Denmark' ],
        [ 'de', 143443, 'Germany' ],
        [ 'sv', 143506, 'El Salvador' ],
        [ 'es', 143454, 'Spain' ],
        [ 'fi', 143447, 'Finland' ],
        [ 'fr', 143442, 'France' ],
        [ 'gr', 143448, 'Greece' ],
        [ 'gt', 143504, 'Guatemala' ],
        [ 'hk', 143463, 'Hong Kong' ],
        [ 'hu', 143482, 'Hungary' ],
        [ 'in', 143467, 'India' ],
        [ 'id', 143476, 'Indonesia' ],
        [ 'ie', 143449, 'Ireland' ],
        [ 'il', 143491, 'Israel' ],
        [ 'it', 143450, 'Italia' ],
        [ 'kr', 143466, 'Korea' ],
        [ 'kw', 143493, 'Kuwait' ],
        [ 'lb', 143497, 'Lebanon' ],
        [ 'lu', 143451, 'Luxembourg' ],
        [ 'my', 143473, 'Malaysia' ],
        [ 'mx', 143468, 'Mexico' ],
        [ 'nl', 143452, 'Nederland' ],
        [ 'nu', 143461, 'New Zealand' ],
        [ 'no', 143457, 'Norway' ],
        [ 'at', 143445, 'Osterreich' ],
        [ 'pk', 143477, 'Pakistan' ],
        [ 'pa', 143485, 'Panama' ],
        [ 'pe', 143507, 'Peru' ],
        [ 'ph', 143474, 'Philippines' ],
        [ 'pl', 143478, 'Poland' ],
        [ 'pt', 143453, 'Portugal' ],
        [ 'qa', 143498, 'Qatar' ],
        [ 'ro', 143487, 'Romania' ],
        [ 'ru', 143469, 'Russia' ],
        [ 'sa', 143479, 'Saudi Arabia' ],
        [ 'ch', 143459, 'Switzerland' ],
        [ 'sg', 143464, 'Singapore' ],
        [ 'sk', 143496, 'Slovakia' ],
        [ 'si', 143499, 'Slovenia' ],
        [ 'za', 143472, 'South Africa' ],
        [ 'lk', 143486, 'Sri Lanka' ],
        [ 'se', 143456, 'Sweden' ],
        [ 'tw', 143470, 'Taiwan' ],
        [ 'th', 143475, 'Thailand' ],
        [ 'tr', 143480, 'Turkey' ],
        [ 'ae', 143481, 'United Arab Emirates' ],
        [ 'uk', 143444, 'United Kingdom' ],
        [ 've', 143502, 'Venezuela' ],
        [ 'vn', 143471, 'Vietnam' ],
    );

    my %store_for;
    for my $row (@stores) {
        my ( $key, $code, $name ) = @$row;
        $store_for{$key} = {
            name => $name,
            code => $code,
        };
    }

    return \%store_for;
}

1;

See Also

Copyright © 髭。/ Hugo + hugo-book