2017/10/16

無聊寫的，抓某網站圖片的程式 get.pl：

#!/usr/bin/perl
use strict;
use warnings;
use Mojo::UserAgent;
use Mojo::URL;
use utf8;
use File::Basename;
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');

# Load the list of pictures downloaded by earlier runs (one
# "site/album/file" entry per line) and append a timestamp that marks the
# start of this run.
#
# A missing pics.txt is normal on the first run and leaves the list empty.
# Any other open failure is fatal: continuing with an empty list would make
# the final save overwrite the existing history.
my ($fp_pics, $fp_extracted_links);
my @pics;
if (open $fp_pics, '<', "pics.txt") {
  binmode($fp_pics, ':utf8');
  @pics = <$fp_pics>;
  chomp @pics;
  close $fp_pics;
}
elsif (!$!{ENOENT}) {
  die "Cannot read pics.txt: $!\n";
}
my $time = localtime;
push @pics, "$time";

# Read the links recorded by earlier runs, one per line. The file is
# optional: when it cannot be opened the list simply starts out empty.
my @extracted_links = ();
if (open $fp_extracted_links, '<', "extracted_links.txt") {
  binmode($fp_extracted_links, ':utf8');
  chomp(@extracted_links = <$fp_extracted_links>);
  close $fp_extracted_links;
}

## Starting URLs: every tag page followed by every category page, built from
## the site root plus the page slug. Tag slugs are already percent-encoded.
my $site_root = "http://www.xiuren.org";

my @tag_slugs = qw(
  %E8%AF%B1%E6%83%91
  %E7%A7%81%E6%88%BF
  %E6%80%A7%E6%84%9F
  %E5%A4%A7%E6%B3%A2
  %E7%BE%8E%E5%A5%B3
  %E5%85%BB%E7%9C%BC
  %E7%99%BD%E8%99%8E
  %E5%86%99%E7%9C%9F
  %E5%A4%A7%E8%83%B8
  %E6%92%A9%E4%BA%BA
  %E6%91%A9%E6%93%A6%E6%91%A9%E6%93%A6
  %E6%9B%BC%E5%A6%99
  %E7%9C%9F%E7%A9%BA
  %E5%A4%A7%E5%A5%B6
  %E5%86%85%E8%A1%A3
  %E6%8C%BA%E6%8B%94
  %E9%A5%B1%E6%BB%A1
  %E7%BE%8E%E4%B9%B3
  %E6%A1%83%E4%B9%B3
  %E4%B9%8B%E5%90%8E%E5%B9%B2%E4%BA%86%E4%B8%AA%E7%88%BD
  %E8%83%B4%E4%BD%93
  %E8%95%BE%E4%B8%9D
  %E5%B0%8F%E7%99%BD%E5%85%94
  MyGirl
);

my @category_slugs = qw(
  toutiaogirls miitao youwu TuiGirl ugirls tgod
  bololi imiss mistar feilin mfstar vgirlmm
  rayshen uxing AISS ru1mm donggan XiuRen
);

my @urls = (
  (map { $site_root . "/tag/$_.html" } @tag_slugs),
  (map { $site_root . "/category/$_.html" } @category_slugs),
);

# Crawl the queue in @urls, first in first out. For every page:
#   * each <span class="photoThum"> element is searched for its first
#     <a href="..." > link; the linked picture is saved under
#     pics/<site>/<album>/<file> unless it is already listed in @pics;
#   * each <div class="content"> element is searched the same way, and a
#     link not seen before is queued and recorded in @extracted_links.
use File::Path qw(make_path);

my $ua = Mojo::UserAgent->new;
# One redirect-following agent, reused for all picture downloads.
my $ua_gif = Mojo::UserAgent->new(max_redirects => 5);
my %visited;

# Exact-match lookup tables. Scraped text is never interpolated into a
# regex: characters such as "." and "?" would be treated as metacharacters
# and unrelated entries could match by prefix or substring.
my %have_pic   = map { $_ => 1 } @pics;
my %known_link = map { $_ => 1 } @extracted_links;

while (@urls) {
  my $url = shift @urls;
  next if $visited{$url}++;

  print "[$url]\n";

  my $res = $ua->get($url)->res;

  $res->dom('span[class=photoThum]')->each(sub {
    my ($span) = @_;
    # A Mojo::DOM element stringifies to its markup.
    my ($match_url) = ( "$span" =~ m/<a href="([^"]*)" / );
    return unless $match_url;

    my ($site, $album, $file) =
      ($match_url =~ m{https?://[^/]*/([^/]*)/([^/]*)/(.*)});
    # Check the captures before any path handling: fileparse/dirname
    # croak when handed an undefined value.
    return unless $site && $album && $file;
    # A trailing slash names a directory, not a picture.
    return if $file =~ m{/\z};
    # The path comes from the remote page: never let it leave pics/.
    return if grep { $_ eq '..' } ($site, $album, split m{/}, $file);

    my $key = "$site/$album/$file";
    return if $have_pic{$key};

    my $dest = "pics/$key";
    # Creates every missing level, including sub-directories inside $file.
    make_path(dirname($dest), { error => \my $mkdir_errors });
    if ($mkdir_errors && @$mkdir_errors) {
      warn "cannot create directory for $dest\n";
      return;
    }

    print "getting $key...";
    my $gif = $ua_gif->get($match_url);
    # Do not store error pages or empty responses as pictures.
    unless ($gif->res->is_success) {
      print "failed\n";
      return;
    }
    $gif->res->content->asset->move_to($dest);
    push @pics, $key;
    $have_pic{$key} = 1;
    print "done\n";
  });

  $res->dom('div[class=content]')->each(sub {
    my ($div) = @_;
    my ($match_url) = ( "$div" =~ m/<a href="([^"]*)" / );
    return unless $match_url;
    return if $known_link{$match_url}++;
    push @urls, $match_url;
    push @extracted_links, $match_url;
  });
}

# Persist the crawl state for the next run. Every open and close is
# checked: buffered write errors only surface at close, and a silently
# failed save would cause everything to be downloaded again next time.
open $fp_pics, '>', "pics.txt"
  or die "Cannot write pics.txt: $!\n";
binmode($fp_pics, ':utf8');
print {$fp_pics} join("\n", @pics), "\n";
close $fp_pics
  or die "Cannot finish writing pics.txt: $!\n";

open $fp_extracted_links, '>', "extracted_links.txt"
  or die "Cannot write extracted_links.txt: $!\n";
binmode($fp_extracted_links, ':utf8');
print {$fp_extracted_links} join("\n", sort @extracted_links), "\n";
close $fp_extracted_links
  or die "Cannot finish writing extracted_links.txt: $!\n";

1 意見:

菠蘿麵包 提到...

底下是改寫成多執行緒的例子，速度快很多：

#!/usr/bin/perl
use strict;
use warnings;
use Mojo::UserAgent;
use Mojo::URL;
use utf8;
use File::Basename;
use threads;

binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');

# Load the state left by earlier runs, then reopen both state files so the
# workers can log new entries as they go.
#
# The files are opened in append mode. Opening them with '>' truncated
# them, and since only entries found during the current run are printed,
# the history loaded here was lost on every run.
use IO::Handle;

my ($fp_pics, $fp_extracted_links, @pics);
if (open $fp_pics, '<', "pics.txt") {
  binmode($fp_pics, ':utf8');
  @pics = <$fp_pics>;
  chomp @pics;
  close $fp_pics;
}

my $time = localtime;
push @pics, "$time";

my @extracted_links = ();
if (open $fp_extracted_links, '<', "extracted_links.txt") {
  binmode($fp_extracted_links, ':utf8');
  @extracted_links = <$fp_extracted_links>;
  chomp @extracted_links;
  close $fp_extracted_links;
}

open $fp_pics, '>>', "pics.txt"
  or die "Cannot open pics.txt for appending: $!\n";
binmode($fp_pics, ':utf8');
# Flush after every print so logged lines are not held back in a buffer.
$fp_pics->autoflush(1);
# Record the start of this run in the log itself.
print {$fp_pics} "$time\n";

open $fp_extracted_links, '>>', "extracted_links.txt"
  or die "Cannot open extracted_links.txt for appending: $!\n";
binmode($fp_extracted_links, ':utf8');
$fp_extracted_links->autoflush(1);

## Starting URLs: one tag page followed by every category page, built from
## the site root plus the page slug.
my $site_root = "http://www.xiuren.org";

my @category_slugs = qw(
  toutiaogirls miitao youwu TuiGirl ugirls tgod
  bololi imiss mistar feilin mfstar vgirlmm
  rayshen uxing AISS ru1mm donggan XiuRen
);

my @urls = (
  $site_root . "/tag/MyGirl.html",
  (map { $site_root . "/category/$_.html" } @category_slugs),
);

# Start one worker thread per starting URL and wait for all of them.
use threads::shared;

my $ua = Mojo::UserAgent->new;
# Shared between threads: a plain lexical is copied into every new thread,
# so each worker would keep a private "visited" set and re-crawl pages
# that another worker has already handled.
my %visited :shared;

my @threads = ();

while (@urls) {
  my $url = shift @urls;
  print "[$url]\n";
  push @threads, threads->new(\&getByUrl, $url);
}
$_->join() for @threads;

# getByUrl($url)
#
# Crawl $url and every page reachable from it through the first
# <a href="..." > link of each <div class="content"> element. On every
# page, the picture linked from each <span class="photoThum"> element is
# saved under pics/<site>/<album>/<file> unless it is already known.
# New pictures and new links are appended to the $fp_pics and
# $fp_extracted_links logs.
#
# Pages already present in %visited are skipped. Returns nothing.
#
# Pages are processed from a local queue instead of by recursion, so a
# long chain of links cannot exhaust the call stack.
use File::Path qw(make_path);

sub getByUrl {
  my ($url) = @_;

  # One redirect-following agent per call, reused for every download.
  my $ua_gif = Mojo::UserAgent->new(max_redirects => 5);

  # Exact-match lookup tables, updated as the crawl proceeds. Scraped text
  # is never interpolated into a regex, where "." or "?" would act as
  # metacharacters and entries could match by prefix or substring.
  my %have_pic   = map { $_ => 1 } @pics;
  my %known_link = map { $_ => 1 } @extracted_links;

  my @queue = ($url);
  while (@queue) {
    my $page = shift @queue;
    # The original used "next" here, outside of any loop, to leave the sub.
    next if exists $visited{$page};
    $visited{$page} = 1;

    print "<$page>\n";
    my $res = $ua->get($page)->res;

    $res->dom('span[class=photoThum]')->each(sub {
      my ($span) = @_;
      # A Mojo::DOM element stringifies to its markup.
      my ($match_url) = ( "$span" =~ m/<a href="([^"]*)" / );
      return unless $match_url;

      my ($site, $album, $file) =
        ($match_url =~ m{https?://[^/]*/([^/]*)/([^/]*)/(.*)});
      # Check the captures before any path handling: fileparse/dirname
      # croak when handed an undefined value.
      return unless $site && $album && $file;
      # A trailing slash names a directory, not a picture.
      return if $file =~ m{/\z};
      # The path comes from the remote page: never let it leave pics/.
      return if grep { $_ eq '..' } ($site, $album, split m{/}, $file);

      my $key = "$site/$album/$file";
      return if $have_pic{$key};

      my $dest = "pics/$key";
      # Creates every missing level, including sub-directories in $file.
      make_path(dirname($dest), { error => \my $mkdir_errors });
      if ($mkdir_errors && @$mkdir_errors) {
        warn "cannot create directory for $dest\n";
        return;
      }

      my $gif = $ua_gif->get($match_url);
      # Do not store error pages or empty responses as pictures.
      return unless $gif->res->is_success;
      $gif->res->content->asset->move_to($dest);
      print {$fp_pics} "$key\n";
      $have_pic{$key} = 1;
    });

    $res->dom('div[class=content]')->each(sub {
      my ($div) = @_;
      my ($match_url) = ( "$div" =~ m/<a href="([^"]*)" / );
      return unless $match_url;
      return if $known_link{$match_url}++;
      print {$fp_extracted_links} "$match_url\n";
      push @queue, $match_url;
    });
  }
  return;
}

# Close both logs once every worker has been joined. Buffered write errors
# only surface at close, so report them instead of ignoring them.
close $fp_pics
  or warn "Failed to close pics.txt: $!\n";
close $fp_extracted_links
  or warn "Failed to close extracted_links.txt: $!\n";