2017/10/16

無聊寫的,抓某網站圖片的程式 get.pl

#!/usr/bin/perl
use strict;
use warnings;
use Mojo::UserAgent;
use Mojo::URL;
use utf8;
use File::Basename;
# Read and write UTF-8 on the standard streams.
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');

# Both handles are declared at file scope because the save step at the end
# of the script reopens them for writing.
my ($fp_pics, $fp_extracted_links);

# @pics holds one line per entry of pics.txt (the pictures recorded by
# earlier runs).  A missing or unreadable file is treated as "nothing
# recorded yet" rather than reading from an unopened handle, which is the
# same way extracted_links.txt is handled below.
my @pics;
if (open $fp_pics, '<:utf8', "pics.txt") {
  @pics = <$fp_pics>;
  chomp @pics;
  close $fp_pics;
}

# Append the start time of this run; it is written back to pics.txt together
# with the picture entries that follow it.
my $time = localtime;
push @pics, "$time";

# @extracted_links holds the links recorded in extracted_links.txt by
# earlier runs; it stays empty when the file cannot be opened.
my @extracted_links;
if (open $fp_extracted_links, '<:utf8', "extracted_links.txt") {
  @extracted_links = <$fp_extracted_links>;
  chomp @extracted_links;
  close $fp_extracted_links;
}

## Starting URLs: the crawl queue is seeded with every tag page followed by
## every category page of the site, in the order listed here.
my $site_root = 'http://www.xiuren.org';

# Percent-encoded tag names, exactly as they appear in the tag page URLs.
my @tag_names = qw(
  %E8%AF%B1%E6%83%91
  %E7%A7%81%E6%88%BF
  %E6%80%A7%E6%84%9F
  %E5%A4%A7%E6%B3%A2
  %E7%BE%8E%E5%A5%B3
  %E5%85%BB%E7%9C%BC
  %E7%99%BD%E8%99%8E
  %E5%86%99%E7%9C%9F
  %E5%A4%A7%E8%83%B8
  %E6%92%A9%E4%BA%BA
  %E6%91%A9%E6%93%A6%E6%91%A9%E6%93%A6
  %E6%9B%BC%E5%A6%99
  %E7%9C%9F%E7%A9%BA
  %E5%A4%A7%E5%A5%B6
  %E5%86%85%E8%A1%A3
  %E6%8C%BA%E6%8B%94
  %E9%A5%B1%E6%BB%A1
  %E7%BE%8E%E4%B9%B3
  %E6%A1%83%E4%B9%B3
  %E4%B9%8B%E5%90%8E%E5%B9%B2%E4%BA%86%E4%B8%AA%E7%88%BD
  %E8%83%B4%E4%BD%93
  %E8%95%BE%E4%B8%9D
  %E5%B0%8F%E7%99%BD%E5%85%94
  MyGirl
);

# Category names, exactly as they appear in the category page URLs.
my @category_names = qw(
  toutiaogirls
  miitao
  youwu
  TuiGirl
  ugirls
  tgod
  bololi
  imiss
  mistar
  feilin
  mfstar
  vgirlmm
  rayshen
  uxing
  AISS
  ru1mm
  donggan
  XiuRen
);

my @urls = (
  (map { "$site_root/tag/$_.html" } @tag_names),
  (map { "$site_root/category/$_.html" } @category_names),
);

use File::Path qw(make_path);

# Crawl loop.
#
# Takes URLs off the front of @urls until the queue is empty.  For every
# page fetched it
#   * downloads the picture linked from each span[class=photoThum] element
#     into pics/<site>/<album>/<file> and records "<site>/<album>/<file>"
#     in @pics, and
#   * queues the link found in each div[class=content] element and records
#     it in @extracted_links.
# Pictures already listed in @pics and links already listed in
# @extracted_links are skipped.

# $ua fetches the HTML pages; $ua_pic is shared by all picture downloads
# and follows up to 5 redirects.
my $ua     = Mojo::UserAgent->new;
my $ua_pic = Mojo::UserAgent->new(max_redirects => 5);
my %visited;

# Exact-match lookup tables.  Interpolating the raw strings into a regex
# (as a grep over the arrays would) breaks on regex metacharacters such as
# "." or "(" and also accepts partial matches.
my %have_pic  = map { $_ => 1 } @pics;
my %have_link = map { $_ => 1 } @extracted_links;

# Returns the href of the first `<a href="..." ` found in the element's
# rendered HTML, or undef when there is none.  The pattern requires a space
# after the closing quote, so the href has to be followed by another
# attribute.
my $first_href = sub {
  my ($element) = @_;
  my ($href) = ("$element" =~ m/<a href="([^"]*)" /);
  return $href;
};

while (@urls) {
  my $url = shift @urls;
  next if exists $visited{$url};
  $visited{$url} = 1;

  print "[$url]\n";

  my $page = $ua->get($url);
  if (my $err = $page->error) {
    # Connection failure or 4xx/5xx answer: nothing useful to parse.
    warn "skipping $url: $err->{message}\n";
    next;
  }
  my $res = $page->res;

  # dom() with a selector returns a collection object, which is always
  # true, so no guard is needed: each() simply does nothing on an empty
  # collection.
  $res->dom('span[class=photoThum]')->each(sub {
    my $pic_url = $first_href->($_[0]) or return;

    # Expect http://<host>/<site>/<album>/<file...>; check the match before
    # touching the parts, fileparse()/dirname() croak on undef.
    my ($site, $album, $file) = ($pic_url =~ m{http://[^/]*/([^/]*)/([^/]*)/(.*)});
    return unless $site && $album && $file;

    my $entry = "$site/$album/$file";
    return if $have_pic{$entry};

    # The path comes from a remote page: never let ".." climb out of pics/.
    return if grep { $_ eq '..' } split m{/}, $entry;

    my $dest = "pics/$entry";
    print "getting $entry...";

    my $download = $ua_pic->get($pic_url);
    if (my $err = $download->error) {
      # Do not store an error page as a picture, and do not record it, so
      # that a later run tries again.
      print "failed: $err->{message}\n";
      return;
    }

    # make_path creates every missing directory level, including
    # sub-directories that are part of <file...>.
    my $saved = eval {
      make_path(dirname($dest));
      $download->res->content->asset->move_to($dest);
      1;
    };
    if (!$saved) {
      my $reason = $@;
      chomp $reason;
      print "failed: $reason\n";
      return;
    }

    push @pics, $entry;
    $have_pic{$entry} = 1;
    print "done\n";
  });

  $res->dom('div[class=content]')->each(sub {
    my $link = $first_href->($_[0]) or return;
    return if $have_link{$link};

    $have_link{$link} = 1;
    push @urls, $link;
    push @extracted_links, $link;
  });
}

# Write the bookkeeping back to disk: pics.txt gets @pics in insertion
# order, extracted_links.txt gets @extracted_links sorted.  Every step is
# checked, because a silent failure here would lose the record of the
# whole run; buffered write errors only surface at close.
open $fp_pics, '>:utf8', "pics.txt"
  or die "cannot open pics.txt for writing: $!";
print {$fp_pics} join("\n", @pics), "\n"
  or die "cannot write pics.txt: $!";
close $fp_pics
  or die "cannot close pics.txt: $!";

open $fp_extracted_links, '>:utf8', "extracted_links.txt"
  or die "cannot open extracted_links.txt for writing: $!";
print {$fp_extracted_links} join("\n", sort @extracted_links), "\n"
  or die "cannot write extracted_links.txt: $!";
close $fp_extracted_links
  or die "cannot close extracted_links.txt: $!";

2017/10/01

讓chrome 可以讀取 local file

首先,要知道的就是,啟動命令中加上 --allow-file-access-from-files
做法上,有幾個地方可以進行
1. 可以在 ubuntu 快捷圖示修改,檔案在
  ~/.local/share/applications/google-chrome.desktop
  如果已經在的話直接修,否則參考以下內容:
[Desktop Entry]
Encoding=UTF-8
Version=1.0
Type=Application
Name=Welcome to Chrome - Google Chrome
Icon=google-chrome
Path=/home/wade
Exec=/opt/google/chrome/chrome --allow-file-access-from-files
StartupNotify=false
StartupWMClass=Google-chrome
OnlyShowIn=Unity;
X-UnityGenerated=true

2. 直接修改啟動 Chrome 用的 google-chrome script(如果你有權限的話)
通常是 /usr/bin/google-chrome
複雜的我們就不看了,只看最後一行,我的範例是:
exec -a "$0" "$HERE/chrome" --allow-cross-origin-auth-prompt --allow-file-access-from-files "$@"

3. 或者是修改安裝的目錄,例如 /opt/google/chrome/google-chrome

附註:事實上底下幾個位置最後都是同一個檔案(前三個是符號連結)
/usr/bin/google-chrome
/etc/alternatives/google-chrome
/usr/bin/google-chrome-stable
/opt/google/chrome/google-chrome
最後一個才是普通檔,其他幾個最後都指向它