HTML::TreeBuilder::Select を使ってみる
use strict;
use warnings;
use LWP::Simple;
use HTML::TreeBuilder::Select;
use Encode;
use Encode::JP;
use Encode::Guess qw(euc-jp shiftjis 7bit-jis);
# Script body: fetch the page at the URL given as the first command-line
# argument, decode it to Perl characters, and print the text of every element
# matched by the 'div.section' selector, encoded as euc-jp.

# Refuse to continue without a URL; otherwise undef would be passed on to
# the fetch and every later step.
my $url = shift @ARGV;
defined $url or die "Usage: $0 URL\n";

# LWP::Simple::get returns undef when the document cannot be retrieved.
my $text = LWP::Simple::get($url);
defined $text or die "Failed to fetch $url\n";

# A reference result means the guess succeeded and can decode the octets;
# anything else is treated as a failed guess, so fall back to euc-jp.
my $enc = Encode::Guess::guess_encoding($text);
$text = ref $enc
    ? $enc->decode($text)
    : Encode::decode('euc-jp', $text);

my $tree = HTML::TreeBuilder::Select->new;
$tree->parse_content($text);

# Each matched section is printed followed by a blank line.
for my $sect ( $tree->select('div.section') ) {
    print Encode::encode( 'euc-jp', $sect->as_text ), "\n\n";
}

# Explicitly dispose of the parsed tree once the output is done.
$tree->delete;
こんな感じ
- なぜ「div.section」なのかというと、狙いは tDiary 。tDiary とテーマ互換のもの、多いし。はてなとか
- なぜ「euc-jp」なのかというと以下略。