AppStore::Scraper アップデート #
この前の iTunes のアップデートのタイミングで、また AppStore::Scraper
が動かなくなった
スクレイピングはこれがあるからなぁ。。。。仕方ないので adhoc に書きなおした
今までと同じように使えるけど、レート毎の件数(☆ 1 つが何件とか)がとれなくなってます
そのかわりってわけじゃないが、
- 件数とアベレージは取得できる
- iPhone でも iPad でも動くアプリは、両方のランキング取得できる
今回から Search API 使ってます。今後はスクレイピングは最小限にして、Search API でとれるものをベースに書き換える予定。。。だが、動いてるうちは書き換えんやろなぁ
前と変わらないけど、使い方
# Usage example: fetch info, ranks and reviews for one app in one store.
use AppStore::Scraper;
use Data::Dumper;
# `wait` is stored by new() and handed to sleep() after each app/store pair
# processed by app_info().
my $obj = AppStore::Scraper->new(wait => 5);
# app / store accept a single value or an array reference.
# lang, review_number and review_order are only honoured when they are all
# digits; review_order 4 means "Most Recent" and review_number caps how many
# reviews are returned.
my $info = $obj->app_info(
app => ['404732112'],
store => ['jp'],
lang => 9,
review_number => 100,
review_order => 4
);
# Dump the nested result ({app id}{store} => info hash) to STDERR.
warn Dumper $info;
とかやると
$VAR1 = {
'404732112' => {
'jp' => {
'review_number' => 100,
'total_rank' => 38,
'store_code' => 143462,
'app_name' => 'Sleipnir Mobile - Web Browser',
'genre_rank' => 4,
'store_name' => 'Japan',
'artist_id' => 318578225,
'genre_name' => 'Utilities',
'ranks' => {
'genre_rank_ipad' => 1,
'total_rank_ipad' => 3,
'total_rank_iphone' => 38,
'genre_rank_iphone' => 4
},
'lang' => 9,
'reviews' => [
{
'date' => '22-Apr-2011',
'message' => "....."
},
.
.
.
],
'ident' => 'both',
'genre_id' => 6002,
'review_order' => 4,
'ratings' => {
'userRatingCount' => 213,
'averageUserRatingForCurrentVersion' => '3.5',
'averageUserRating' => '3.5',
'userRatingCountForCurrentVersion' => 34
},
'price' => '0'
}
}
};
こんな感じ
ソースコードは長いので、続きからどうぞ
package AppStore::Scraper;
use strict;
use utf8;
use warnings;
use Data::Dumper;
use LWP::UserAgent;
use XML::Simple;
use JSON;
# Constructor.
#
# Accepts either a hash reference or a flat key/value list. The only option
# read here is `wait`, the value app_info() later passes to sleep() after
# each app/store pair; a missing or false value falls back to '1'.
#
# Returns the new object, which carries the store table, the URL prefixes,
# a configured LWP::UserAgent (30 second timeout, proxy settings taken from
# the environment) and the XML parser preference used by _get_xml().
sub new {
    my ( $class, @args ) = @_;
    my $opts = ref $args[0] eq 'HASH' ? $args[0] : {@args};

    my $url_pref = 'http://ax.itunes.apple.com/WebObjects/';

    # NOTE(review): the parenthesis opened in this User-Agent value is never
    # closed. It is kept exactly as it was; confirm whether that is intended.
    my $agent = 'iTunes/9.1.1 (Macintosh; Intel Mac OS X 10.6.3';

    my $ua = LWP::UserAgent->new();
    $ua->timeout(30);
    $ua->env_proxy;
    $ua->agent($agent);

    # Other parsers that were tried for XML::Simple:
    #   'XML::SAX::PurePerl', 'XML::SAX::Expat', 'XML::LibXML::SAX'
    my %fields = (
        __STORE_CODES          => _init_countries(),
        __URL_PREF             => $url_pref,
        __SCRAPING_URL_PREF    => $url_pref . 'MZStore.woa/wa/',
        __SEARCHAPI_URL_PREF   => $url_pref . 'MZStoreServices.woa/wa/',
        __UA                   => $agent,
        ua                     => $ua,
        __WAIT                 => $opts->{wait} || '1',
        __XML_PREFERRED_PARSER => 'XML::Parser',
    );

    return bless \%fields, ref $class || $class;
}
# Collect everything known about the requested apps.
#
# Arguments are forwarded unchanged to app_base_info(). For every app/store
# pair that came back with a genre_id, the iPhone and/or iPad ranks (genre
# and overall) and the reviews are fetched and merged with the base info.
#
# Returns a hash reference shaped as { app id => { store => info hash } }.
# The top-level genre_rank / total_rank keys mirror the iPhone ranks; the
# `ranks` sub-hash always has all four keys, undef where no lookup was made.
sub app_info {
    my ( $self, @args ) = @_;

    my $base_info = $self->app_base_info(@args);

    my %result;
    for my $app_id ( keys %{$base_info} ) {
        my $stores = $base_info->{$app_id};
        for my $store ( keys %{$stores} ) {
            my $info = $stores->{$store};
            next unless $info->{genre_id};

            my %rank = (
                genre_rank_iphone => undef,
                total_rank_iphone => undef,
                genre_rank_ipad   => undef,
                total_rank_ipad   => undef,
            );

            # The rank helpers read the device from $info->{ident}, so it is
            # switched to the device being queried and put back afterwards.
            if ( $info->{ident} ne 'ipad' ) {
                my $saved_ident = $info->{ident};
                $info->{ident} = 'iphone';
                $rank{genre_rank_iphone}
                    = $self->genre_rank( app => $app_id, info => $info );
                $rank{total_rank_iphone}
                    = $self->total_rank( app => $app_id, info => $info );
                $info->{ident} = $saved_ident;
            }
            if ( $info->{ident} ne 'iphone' ) {
                my $saved_ident = $info->{ident};
                $info->{ident} = 'ipad';
                $rank{genre_rank_ipad}
                    = $self->genre_rank( app => $app_id, info => $info );
                $rank{total_rank_ipad}
                    = $self->total_rank( app => $app_id, info => $info );
                $info->{ident} = $saved_ident;
            }

            my $reviews = $self->app_reviews( app => $app_id, info => $info );

            $result{$app_id}->{$store} = {
                %{$info},
                genre_rank => $rank{genre_rank_iphone},
                total_rank => $rank{total_rank_iphone},
                ranks      => \%rank,
                reviews    => $reviews,
                store_name => $self->{__STORE_CODES}->{$store}->{name},
            };

            # Pause between app/store pairs.
            sleep $self->{__WAIT};
        }
    }

    return \%result;
}
# Look up the basic metadata of each requested app in each requested store
# through the Search API (wsLookup).
#
# Arguments are validated by _validate_args() (app, store, lang, ident,
# review_number, review_order).
#
# Returns a hash reference { app id => { store => info } } where info holds
# store_code, lang, ident, review_number, review_order, genre_id, artist_id,
# app_name, genre_name, price and a `ratings` hash. A store whose HTTP
# request fails is reported with warn and left out. A response that cannot
# be decoded, or that has no results, is reported (when undecodable) and
# yields an entry whose looked-up fields are undef.
sub app_base_info {
    my ( $self, @args ) = @_;
    my $args = $self->_validate_args(@args);

    my $json = JSON->new->utf8;
    my $ret  = {};

    for my $app ( @{ $args->{apps} } ) {
        for my $store ( keys %{ $args->{stores} } ) {
            my $tmp = {
                store_code    => $args->{stores}->{$store}->{code},
                lang          => $args->{lang},
                ident         => $args->{ident},
                review_number => $args->{review_number},
                review_order  => $args->{review_order},
            };

            # The store table uses 'uk', the lookup service expects 'gb'.
            my $country = $store eq 'uk' ? 'gb' : $store;
            my $uri = $self->{__SEARCHAPI_URL_PREF}
                . 'wsLookup?id=' . $app
                . '&entity=software&country=' . $country;

            my $res = $self->{ua}->get($uri);
            unless ( $res->is_success ) {
                warn 'request failed: ', $uri, ': ', $res->status_line, ': ', $store, '-', $args->{lang};
                next;
            }

            # Decode the body and report (rather than silently ignore) a
            # malformed response. The entry is still stored so the shape of
            # the returned structure does not change.
            my $hash;
            my $decoded = eval { $hash = $json->decode( $res->content ); 1 };
            unless ($decoded) {
                warn 'invalid JSON response: ', $uri, ': ', ( $@ || 'unknown error' );
            }

            # Only trust the first result when the response has the expected
            # shape; otherwise every looked-up field stays undef.
            my $app_data = {};
            if (   ref $hash eq 'HASH'
                && ref $hash->{results} eq 'ARRAY'
                && ref $hash->{results}->[0] eq 'HASH' )
            {
                $app_data = $hash->{results}->[0];
            }

            #
            # genre_id, artist_id, app_name, genre_name
            #
            $tmp->{genre_id}  = $app_data->{primaryGenreId};
            $tmp->{artist_id} = $app_data->{artistId};
            $tmp->{app_name}  = $app_data->{trackName};
            if ( $tmp->{app_name} ) {

                # Strip leading and trailing whitespace independently, so a
                # name padded on only one side is trimmed as well.
                $tmp->{app_name} =~ s/^\s+//;
                $tmp->{app_name} =~ s/\s+$//;
            }
            $tmp->{genre_name} = $app_data->{primaryGenreName};

            #
            # price
            #
            $tmp->{price} = $app_data->{price};

            #
            # star
            #
            $tmp->{ratings} = {
                averageUserRating                  => $app_data->{averageUserRating},
                userRatingCount                    => $app_data->{userRatingCount},
                averageUserRatingForCurrentVersion => $app_data->{averageUserRatingForCurrentVersion},
                userRatingCountForCurrentVersion   => $app_data->{userRatingCountForCurrentVersion},
            };

            $ret->{$app}->{$store} = $tmp;
        }
    }

    return $ret;
}
#
# for rank
#
# Rank of an app inside its own genre.
#
# Thin wrapper: all arguments go to _get_rank(), which looks at the name of
# the sub that called it and adds the genre filter when that name ends in
# "genre_rank". It therefore has to stay a direct caller of _get_rank().
sub genre_rank {
    my ( $self, @args ) = @_;
    return $self->_get_rank(@args);
}
# Rank of an app in the overall (all genres) chart.
#
# Thin wrapper: all arguments go to _get_rank(). Because this sub's name
# does not end in "genre_rank", _get_rank() adds no genre filter.
sub total_rank {
    my ( $self, @args ) = @_;
    return $self->_get_rank(@args);
}
# Build the chart URL for a price tier and device.
#
#   $price - true selects the paid chart, false the free chart
#   $ident - 'ipad' selects the iPad chart, anything else the iPhone chart
#
# Returns the viewTop URL with the matching popId.
sub _rank_uri {
    my ( $self, $price, $ident ) = @_;

    # popId per device and tier: iphone 30:27, ipad 47:44
    my %pop_id_for = (
        iphone => { paid => 30, free => 27 },
        ipad   => { paid => 47, free => 44 },
    );

    my $device = $ident eq 'ipad' ? 'ipad' : 'iphone';
    my $tier   = $price           ? 'paid' : 'free';

    return $self->{__SCRAPING_URL_PREF}
        . 'viewTop?id=25209&popId='
        . $pop_id_for{$device}->{$tier};
}
# Shared implementation behind genre_rank() and total_rank().
#
# Takes a hash reference or key/value list with `app` and either `info`
# (an entry as produced by app_base_info()) or enough arguments (`store`,
# ...) for app_base_info() to be called here.
#
# Returns the 1-based position of the first chart entry whose
# salableAdamId equals the app id, or undef when it is not listed.
sub _get_rank {
    my ( $self, @args ) = @_;
    my $opts = ref $args[0] eq 'HASH' ? $args[0] : {@args};

    # Name of the sub that called us; decides whether the chart is filtered
    # by genre. Must be read in this frame.
    my $calling_sub = ( caller(1) )[3];

    my $info = $opts->{info};
    unless ($info) {
        my $base_info = $self->app_base_info($opts);
        $info = $base_info->{ $opts->{app} }->{ $opts->{store} };
    }

    my $uri = $self->_rank_uri( $info->{price}, $info->{ident} );
    if ( $calling_sub =~ /genre_rank$/ ) {
        $uri .= '&genreId=' . $info->{genre_id};
    }

    my $xmlobj = $self->_get_xml( $uri, $info->{store_code}, $info->{lang} );

    # The chart is scanned as Data::Dumper text, one line at a time.
    # NOTE(review): positions follow the order of the dumped lines; confirm
    # that this always matches the chart order.
    my $dump  = Dumper( $xmlobj->{View}->{ScrollView}->{VBoxView}->{View} );
    my @lines = split /\n+/, $dump;

    my $rank;
    my $position = 0;
    for my $line (@lines) {
        next unless $line =~ /salableAdamId=(\d+)/;
        $position++;
        if ( $1 == $opts->{app} ) {
            $rank = $position;
            last;
        }
    }

    return $rank;
}
#
# for reviews
#
# Fetch the reviews of one app.
#
# Takes a hash reference or key/value list with `app` and either `info`
# (an entry as produced by app_base_info()) or enough arguments (`store`,
# ...) for app_base_info() to be called here. From `info` it reads
# review_order, review_number, store_code, lang and an optional review_url.
#
# Returns an array reference of { message => ..., date => ... } hashes,
# holding at most review_number entries.
sub app_reviews {
    my ( $self, @args ) = @_;
    my $args_ref = ref $args[0] eq 'HASH' ? $args[0] : {@args};

    my $info = $args_ref->{info};
    unless ($info) {
        my $base_info = $self->app_base_info($args_ref);
        $info = $base_info->{ $args_ref->{app} }->{ $args_ref->{store} };
    }

    my $order = $info->{review_order};
    my $limit = $info->{review_number};

    my $uri = $info->{review_url}
        || $self->{__SCRAPING_URL_PREF}
        . 'viewContentsUserReviews?pageNumber=0&type=Purple+Software&id='
        . $args_ref->{app}
        . '&sortOrdering=' . $order;

    # A supplied review_url may carry its own ordering; force the requested one.
    $uri =~ s|sortOrdering=\d+|sortOrdering=$order|x;

    my @reviews;

    # pagination
    if ( $uri =~ /(?:\?|&)pageNumber=\d+/ ) {
        my $page = 0;

        # Stop as soon as enough reviews are collected. A strict "<" avoids
        # requesting one more page when exactly $limit reviews are in hand.
        while ( scalar(@reviews) < $limit ) {
            $uri =~ s|pageNumber=\d+|pageNumber=$page|;
            my $batch = $self->_app_reviews( $uri, $info->{store_code}, $info->{lang} );
            last unless scalar( @{$batch} );
            push @reviews, @{$batch};
            $page++;
        }
    }
    else {
        @reviews = @{ $self->_app_reviews( $uri, $info->{store_code}, $info->{lang} ) };
    }

    # The last page may overshoot; keep only the requested number.
    @reviews = splice @reviews, 0, $limit;

    return \@reviews;
}
# Fetch one review page and turn it into a list of reviews.
#
#   $uri        - page URL
#   $store_code - store front code, passed on to _get_xml()
#   $lang       - language code, passed on to _get_xml()
#
# Returns an array reference of { message => ..., date => ... } hashes;
# it is empty when the expected node is neither a hash nor an array.
sub _app_reviews {
    my ( $self, $uri, $store_code, $lang ) = @_;

    my $xmlobj = $self->_get_xml( $uri, $store_code, $lang );
    my $tree
        = $xmlobj->{View}->{ScrollView}->{VBoxView}->{View}->{MatrixView}
        ->{VBoxView}->[0]->{VBoxView}->{VBoxView};

    # The node is a hash for a single review and an array for several.
    my $shape = ref $tree;
    my @review_nodes
        = $shape eq 'HASH'  ? ($tree)
        : $shape eq 'ARRAY' ? @{$tree}
        :                     ();

    my @reviews;
    for my $node (@review_nodes) {
        my ( $date, $mes ) = $self->_get_review_message($node);
        push @reviews, { message => $mes, date => $date };
    }

    return \@reviews;
}
# Extract the date and the message text from one review node.
#
#   $args - review node (hash reference) from the parsed review page
#
# Returns the list ($date, $message). The message parts are joined with
# newlines when the content is an array. The date is the last chunk of the
# by-line text after splitting on "newline + whitespace", or undef when
# that text is empty.
sub _get_review_message {
    my ( $self, $args ) = @_;

    my $mes    = $args->{TextView}->{SetFontStyle}->{content};
    my $byline = $args->{HBoxView}->[1]->{TextView}->{SetFontStyle}->{content} || '';

    # When the by-line content is an array, only its last element is used.
    my $datetmp = ref $byline eq 'ARRAY' ? $byline->[-1] : $byline;

    my $date;
    if ($datetmp) {
        chomp $datetmp;
        my @chunks = split /\n\s+/, $datetmp;
        $date = $chunks[-1];
    }

    $mes = join "\n", @{$mes} if ref $mes eq 'ARRAY';

    return ( $date, $mes );
}
#
# common
#
# Normalise the arguments shared by the public methods.
#
# Accepts a hash reference or a key/value list with:
#   app           - required; one app id or an array reference of ids,
#                   each made of digits only
#   store         - optional; one store key or an array reference of keys
#                   (case-insensitive); all known stores when omitted
#   lang          - digits only, default 1
#   ident         - 'ipad' or 'iphone', anything else becomes 'both'
#   review_number - digits only, default 25
#   review_order  - digits only, default 1
#                   (1 Most Helpful, 2 Most Favourable, 3 Most Critical,
#                    4 Most Recent)
#
# Dies when app is missing or not numerical, or when a store is unknown.
# Returns a hash reference with the keys apps, stores, lang, ident,
# review_number and review_order.
sub _validate_args {
    my ( $self, @args ) = @_;
    my $opts = ref $args[0] eq 'HASH' ? $args[0] : {@args};

    # Target apps.
    die 'app code MUST be needed' unless $opts->{app};
    my @app_codes
        = ref $opts->{app} eq 'ARRAY' ? @{ $opts->{app} } : ( $opts->{app} );
    for my $code (@app_codes) {
        die 'app code MUST be numerical: ', $code unless $code =~ m|^\d+$|;
    }

    # Target stores.
    my $stores;
    if ( $opts->{store} ) {
        my $wanted = $opts->{store};
        for my $name ( ref $wanted eq 'ARRAY' ? @{$wanted} : ($wanted) ) {
            my $key = lc $name;
            die 'cannot found appstore on "', $key, '"'
                unless exists $self->{__STORE_CODES}->{$key};
            $stores->{$key} = $self->{__STORE_CODES}->{$key};
        }
    }
    else {
        $stores = $self->{__STORE_CODES};
    }

    # An option is used only when it is present and all digits.
    my $digits_or = sub {
        my ( $key, $default ) = @_;
        return $default unless exists $opts->{$key};
        return $opts->{$key} =~ /^\d+$/ ? $opts->{$key} : $default;
    };

    # Device identifier.
    my $ident = 'both';
    if ( exists $opts->{ident} ) {
        for my $device (qw(ipad iphone)) {
            next unless $opts->{ident} eq $device;
            $ident = $device;
            last;
        }
    }

    return {
        apps          => \@app_codes,
        stores        => $stores,
        lang          => $digits_or->( 'lang',          1 ),
        ident         => $ident,
        review_number => $digits_or->( 'review_number', 25 ),
        review_order  => $digits_or->( 'review_order',  1 ),
    };
}
# Fetch a store page and parse it with XML::Simple.
#
#   $uri   - page URL
#   $store - store front code
#   $lang  - language code
#
# The store and language are sent as the X-Apple-Store-Front header, which
# is installed as a default header on the shared user agent.
#
# Returns the structure produced by XMLin(). When the request fails or the
# response is not XML, a warning is emitted and nothing is returned (undef
# in scalar context).
sub _get_xml {
    my ( $self, $uri, $store, $lang ) = @_;

    $self->{ua}->default_header( 'X-Apple-Store-Front' => $store . '-' . $lang );
    my $res = $self->{ua}->get($uri);

    # Error Check
    #
    # Failures leave with "return", not "next": this sub contains no loop,
    # so "next" would act on whatever loop the caller happens to be running
    # (or die when there is none).
    unless ( $res->is_success ) {
        warn 'request failed: ', $uri, ': ', $res->status_line, ': ', $store, '-', $lang;
        return;
    }

    # A response without a Content-Type header is treated as "not xml".
    my $content_type = $res->headers->header('Content-Type');
    $content_type = '' unless defined $content_type;
    unless ( $content_type =~ m|/xml| ) {
        warn 'content is not xml: ', $uri, ': ', $content_type, ': ', $store, '-', $lang;
        return;
    }

    local $XML::Simple::PREFERRED_PARSER = $self->{__XML_PREFERRED_PARSER};
    my $xmlobj = XMLin( $res->content );

    return $xmlobj;
}
# Build the table of supported stores.
#
# Returns a new hash reference on every call, keyed by lower-case store key,
# each value being { name => display name, code => store front code }.
#
# The key for New Zealand is 'nz' (the key 'nu' is the country code of
# Niue, not New Zealand). 'uk' is kept as the key for the United Kingdom;
# app_base_info() translates it to 'gb' for the lookup request.
sub _init_countries {

    # [ store key, display name, store front code ]
    my @stores = (
        [ 'jp', 'Japan',                143462 ],
        [ 'us', 'United States',        143441 ],
        [ 'ar', 'Argentina',            143505 ],
        [ 'au', 'Australia',            143460 ],
        [ 'be', 'Belgium',              143446 ],
        [ 'br', 'Brazil',               143503 ],
        [ 'ca', 'Canada',               143455 ],
        [ 'cl', 'Chile',                143483 ],
        [ 'cn', 'China',                143465 ],
        [ 'co', 'Colombia',             143501 ],
        [ 'cr', 'Costa Rica',           143495 ],
        [ 'hr', 'Croatia',              143494 ],
        [ 'cz', 'Czech Republic',       143489 ],
        [ 'dk', 'Denmark',              143458 ],
        [ 'de', 'Germany',              143443 ],
        [ 'sv', 'El Salvador',          143506 ],
        [ 'es', 'Spain',                143454 ],
        [ 'fi', 'Finland',              143447 ],
        [ 'fr', 'France',               143442 ],
        [ 'gr', 'Greece',               143448 ],
        [ 'gt', 'Guatemala',            143504 ],
        [ 'hk', 'Hong Kong',            143463 ],
        [ 'hu', 'Hungary',              143482 ],
        [ 'in', 'India',                143467 ],
        [ 'id', 'Indonesia',            143476 ],
        [ 'ie', 'Ireland',              143449 ],
        [ 'il', 'Israel',               143491 ],
        [ 'it', 'Italia',               143450 ],
        [ 'kr', 'Korea',                143466 ],
        [ 'kw', 'Kuwait',               143493 ],
        [ 'lb', 'Lebanon',              143497 ],
        [ 'lu', 'Luxembourg',           143451 ],
        [ 'my', 'Malaysia',             143473 ],
        [ 'mx', 'Mexico',               143468 ],
        [ 'nl', 'Nederland',            143452 ],
        [ 'nz', 'New Zealand',          143461 ],
        [ 'no', 'Norway',               143457 ],
        [ 'at', 'Osterreich',           143445 ],
        [ 'pk', 'Pakistan',             143477 ],
        [ 'pa', 'Panama',               143485 ],
        [ 'pe', 'Peru',                 143507 ],
        [ 'ph', 'Philippines',          143474 ],
        [ 'pl', 'Poland',               143478 ],
        [ 'pt', 'Portugal',             143453 ],
        [ 'qa', 'Qatar',                143498 ],
        [ 'ro', 'Romania',              143487 ],
        [ 'ru', 'Russia',               143469 ],
        [ 'sa', 'Saudi Arabia',         143479 ],
        [ 'ch', 'Switzerland',          143459 ],
        [ 'sg', 'Singapore',            143464 ],
        [ 'sk', 'Slovakia',             143496 ],
        [ 'si', 'Slovenia',             143499 ],
        [ 'za', 'South Africa',         143472 ],
        [ 'lk', 'Sri Lanka',            143486 ],
        [ 'se', 'Sweden',               143456 ],
        [ 'tw', 'Taiwan',               143470 ],
        [ 'th', 'Thailand',             143475 ],
        [ 'tr', 'Turkey',               143480 ],
        [ 'ae', 'United Arab Emirates', 143481 ],
        [ 'uk', 'United Kingdom',       143444 ],
        [ 've', 'Venezuela',            143502 ],
        [ 'vn', 'Vietnam',              143471 ],
    );

    my %store_for;
    for my $row (@stores) {
        my ( $key, $name, $code ) = @{$row};
        $store_for{$key} = { name => $name, code => $code };
    }

    return \%store_for;
}
1;