画像ぶっこ抜くスクリプト書いてみた

クローラーを作ってみた

制約
検索エンジンのクローラーみたいなのは迷惑この上ないスクリプトになっちゃう
サイトを指定してそのサイト内を再帰的に読み込んで画像ぶっこ抜く

目的
JavaじゃなくてPHPのオブジェクト指向でなんか書く

どうしようかなっと思った事

・ストレージ
DBなどのストレージを使用すれば、途中で処理を中止しても続きから処理できるが
使用する人の環境によってはDB構築のハードルがあると思ったので、スクリプト内で完結させた。電源切っちゃうと、初めからやり直す

・負荷対策
そのままの速度でプログラムに処理させると、接続先サーバーに多大なる迷惑をかけるので
処理速度をメソッドで調節。

・使用手順

以下に記述する5つのソースとPEARのLog(Log/Log.php)を読みこんで、以下のように実行↓

// Load the third-party libraries (Snoopy, htmlSQL) and the classes described in this article.
include_once("snoopy.class.php");
include_once("htmlsql.class.php");
include_once("linkbuilder.class.php");
include_once("downloader.class.php");
include_once("crawler.class.php");
// PEAR Log is used by Crawler for logging.
include_once('Log/Log.php');
// Create the crawler and start scanning from the given URL.
$rscan = new Crawler();
$rscan->recursiveScan("http://example.com");

LOG機能
PEARのLogを使用。ログはカレントディレクトリのout.logファイルに書かれる

include_once('Log/Log.php');

以下に記述するオプションを指定する際はクローラーオブジェクト生成後、メソッドを呼び出す。

$rscan = new Crawler();

オプション１
解析するページ数のセット(デフォルトでは1ページ読み込むと終了。↓20Pの場合)

$rscan->setPages(20);

オプション２
解析速度の調整(10秒に1度、5httpリクエストを投げる)

$rscan->setTimerInOneCrawl(10);
$rscan->setPagesInOneCrawl(5);

オプション３
リファラーのセット

$rscan->setReferer("http://example.com/");

オプション４
ユーザーエージェントのセット(例えば、ブラウザはfirefox,OSはMac Osの時↓)

$rscan->setUserAgent("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; ja-JP-mac; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13");

オプション５
画像ダウンロード先の指定(DLディレクトリを生成)
指定しなければカレントディレクトリに画像が保存される。
パス指定は、MACのファイルシステムの場合(デスクトップに生成)↓

$rscan->setDownloadPath("/Users/yourname/desktop/download_folder");

最後に再帰的に解析。

$rscan->recursiveScan("http://example.com");

以下、読み込むスクリプトファイルの概要

1 htmlsql.class.php

HTMLの抽出は面倒なのでオープンソースのhtmlsqlを使用。

こっからDL↓
htmlsql

2 snoopy.class.php
HTTP Responseの取得等オープンソースのsnoopyを使用。

こっからDL↓
Snoopy | Download Snoopy software for free at SourceForge.net

ちなみにfunctionは全部publicにしてるのはお恥ずかしいです........

3 linkbuilder.class.php
webには様々な種類のリンクがあるのでそれを絶対パスに変換するため作成

class LinkBuilder{

  /**
   * Convert a link found in a page into an absolute URL.
   *
   * Handles absolute http/https links, protocol-relative links ("//host/x"),
   * root-relative links ("/x"), fragment-only and query-only links, and
   * relative paths ("x.png", "dir/x.png", "./x.png", "../x.png"). Relative
   * paths are resolved against the directory of $url and "." / ".." segments
   * are collapsed.
   *
   * @param string $link link exactly as written in the src/href attribute
   * @param string $url  absolute URL of the page that contains the link
   * @return string|null absolute URL, or null when the link cannot be resolved
   *                     (empty link, non-http scheme such as mailto:, or a
   *                     base URL without scheme/host)
   */
  public static function getAbsolutePath($link,$url){
    $link = trim((string)$link);
    $parse_url = parse_url((string)$url);
    if($link === "" || !is_array($parse_url) || !isset($parse_url['scheme'],$parse_url['host'])){
      return null;
    }
    if(self::isAbsolutePath($link)){
      return $link;
    }
    $origin = $parse_url['scheme']."://".$parse_url['host'];
    if(isset($parse_url['port'])){
      $origin .= ":".$parse_url['port'];
    }
    // Protocol-relative link: reuse only the scheme of the current page.
    if(strncmp($link,"//",2) === 0){
      return $parse_url['scheme'].":".$link;
    }
    if(self::isRootPath($link)){
      return $origin.$link;
    }
    // mailto:, javascript:, data:, ftp: ... are not crawlable.
    if(self::hasScheme($link)){
      return null;
    }
    $base_path = isset($parse_url['path']) ? $parse_url['path'] : "/";
    if($link[0] === "#"){
      $query = isset($parse_url['query']) ? "?".$parse_url['query'] : "";
      return $origin.$base_path.$query.$link;
    }
    if($link[0] === "?"){
      return $origin.$base_path.$link;
    }
    // Keep the link's own query/fragment out of the dot-segment handling.
    $split_at = strcspn($link,"?#");
    $link_path = (string)substr($link,0,$split_at);
    $suffix = (string)substr($link,$split_at);
    $last_slash = strrpos($base_path,"/");
    $base_dir = ($last_slash === false) ? "/" : substr($base_path,0,$last_slash + 1);
    return $origin.self::removeDotSegments($base_dir.$link_path).$suffix;
  }

  /**
   * Validate that an absolute URL is a crawlable document of the same site.
   *
   * @param string|null $absolute_path URL produced by getAbsolutePath()
   * @param string      $url           URL of the page being scanned
   * @return string the unchanged $absolute_path when it is accepted
   * @throws ScanningException when the URL is empty, points to a non-document
   *                           file (extension other than html/htm/php),
   *                           contains a fragment, or belongs to another host
   */
  public static function obtainInsideLink($absolute_path,$url){
    if(!$absolute_path){
      throw new ScanningException("[anchor error]url「".$absolute_path."」 is unjust link.");
    }
    // Look at the extension of the last path segment only (query string excluded).
    $path = parse_url($absolute_path,PHP_URL_PATH);
    $extension = "";
    if(is_string($path)){
      $slash = strrpos($path,"/");
      $file = ($slash === false) ? $path : (string)substr($path,$slash + 1);
      $dot = strrpos($file,".");
      if($dot !== false){
        $extension = strtolower((string)substr($file,$dot + 1));
      }
    }
    if(preg_match("/[a-z]/",$extension) && !in_array($extension,array("html","htm","php"),true)){
      throw new ScanningException("[anchor error]url「".$absolute_path."」 is not document file.");
    }
    if(strpos($absolute_path,"#") !== false){
      throw new ScanningException("[anchor error]url「".$absolute_path."」 is link of #　attribute.");
    }
    $base_host = parse_url((string)$url,PHP_URL_HOST);
    $link_host = parse_url($absolute_path,PHP_URL_HOST);
    if(!is_string($base_host) || !is_string($link_host) || !self::isSameSite($link_host,$base_host)){
      throw new ScanningException("[anchor error]url「".$absolute_path."」 is external link.");
    }
    return $absolute_path;
  }

  /**
   * Return the part of the link before its first slash.
   *
   * @param string $link
   * @return string|false text before the first "/" ("" when the link starts
   *                      with "/"), or false when the link contains no slash
   */
  public static function pullOutFirstSlashPartOfLink($link){
    return strstr((string)$link,'/',true);
  }

  /**
   * Return the part of the link after its last slash.
   *
   * @param string $link
   * @return string last "/"-separated segment (the whole link when it has no slash)
   */
  public static function pullOutLastSlashPartOfLink($link){
    // explode() replaces split(), which was removed in PHP 7.
    $parts = explode("/",(string)$link);
    return $parts[count($parts) - 1];
  }

  /**
   * Return the first character of the link.
   *
   * @param string $link
   * @return string|false result of substr() on the first byte
   */
  public static function pullOutFirstWordOfLink($link){
    return substr((string)$link,0,1);
  }

  /**
   * Return the last character of the link.
   *
   * @param string $link
   * @return string|false result of substr() on the last byte
   */
  public static function pullOutLastWordOfLink($link){
    return substr((string)$link,-1,1);
  }

  /**
   * Tell whether the link starts at the site root (begins with "/").
   *
   * @param string $link
   * @return bool
   */
  public static function isRootPath($link){
    return strncmp((string)$link,"/",1) === 0;
  }

  /**
   * Tell whether the link is a plain relative path such as "a.png" or
   * "dir/a.png": not empty, no scheme, and not starting with "/", "./",
   * "../", "#" or "?".
   *
   * @param string $link
   * @return bool
   */
  public static function isLiteralPath($link){
    $link = (string)$link;
    if($link === "" || $link[0] === "#" || $link[0] === "?"){
      return false;
    }
    if(self::isRootPath($link) || self::isCurrentPath($link) || self::isParentPath($link)){
      return false;
    }
    return !self::hasScheme($link);
  }

  /**
   * Tell whether the link begins with "./".
   *
   * @param string $link
   * @return bool
   */
  public static function isCurrentPath($link){
    return strncmp((string)$link,"./",2) === 0;
  }

  /**
   * Tell whether the link begins with "../".
   *
   * @param string $link
   * @return bool
   */
  public static function isParentPath($link){
    return strncmp((string)$link,"../",3) === 0;
  }

  /**
   * Tell whether the link is already an absolute http or https URL.
   *
   * @param string $link
   * @return bool
   */
  public static function isAbsolutePath($link){
    return preg_match('#^https?://#i',(string)$link) === 1;
  }

  /**
   * Tell whether the link starts with a URI scheme ("mailto:", "javascript:", ...).
   *
   * @param string $link
   * @return bool
   */
  private static function hasScheme($link){
    return preg_match('#^[a-z][a-z0-9+.\-]*:#i',$link) === 1;
  }

  /**
   * Collapse "." and ".." segments of an absolute path ("/a/b/../c" => "/a/c").
   * ".." never climbs above the root.
   *
   * @param string $path path beginning with "/"
   * @return string normalised path
   */
  private static function removeDotSegments($path){
    $segments = explode("/",$path);
    $kept = array();
    foreach($segments as $segment){
      if($segment === "."){
        continue;
      }
      if($segment === ".."){
        // Index 0 is the empty segment in front of the leading slash; keep it.
        if(count($kept) > 1){
          array_pop($kept);
        }
        continue;
      }
      $kept[] = $segment;
    }
    // A trailing "." or ".." denotes a directory, so keep the trailing slash.
    $last = $segments[count($segments) - 1];
    if($last === "." || $last === ".."){
      $kept[] = "";
    }
    $normalized = implode("/",$kept);
    return ($normalized === "") ? "/" : $normalized;
  }

  /**
   * Tell whether $link_host is $base_host itself or one of its subdomains.
   *
   * @param string $link_host
   * @param string $base_host
   * @return bool
   */
  private static function isSameSite($link_host,$base_host){
    $link_host = strtolower($link_host);
    $base_host = strtolower($base_host);
    if($link_host === $base_host){
      return true;
    }
    $tail = ".".$base_host;
    return strlen($link_host) > strlen($tail) && substr($link_host,-strlen($tail)) === $tail;
  }
}

4 crawler.class.php 再帰的にリンクをクロールする機能を持たせた

/**
 * Thrown when the htmlSQL object fails to connect to a URL or to run a query.
 */
class HtmlsqlException extends Exception
{
}

/**
 * Thrown when a link found during scanning is rejected
 * (not a document, external, a fragment link, or already scanned).
 */
class ScanningException extends Exception
{
}
/**
 * Recursive image crawler.
 *
 * Starting from one URL it downloads the images of each page through
 * Downloader and follows the same-site document links returned by
 * LinkBuilder until the configured number of pages has been read.
 * Progress and rejected links are written to out.log through PEAR Log.
 */
class Crawler{
  // becomes true once the crawler has read the number of pages set by the user
  private $flag_finish = false;
  // htmlSQL object used for the page currently being scanned
  private $obj_htmlsql;
  // custom User-Agent (null = do not override)
  private $user_agent;
  // referer sent with the first request
  private $referer;
  // directory the image files are saved to ('' / null = current directory)
  private $download_path;
  // number of pages scanned so far
  private $page_counter_for_crawler = 0;
  // number of pages the user wants to scan
  private $page_counter_for_user = 1;
  // number of pages scanned between two pauses
  private $pages_in_one_crawl = 5;
  // length of each pause in seconds
  private $timer_in_one_crawl = 10;
  // document URLs that were already queued or scanned
  private $doc_link_of_memory = array();
  // image URLs that were already returned for download
  private $img_link_of_memory = array();
  // configuration handed to the PEAR Log file handler
  private $conf = array('mode' => 0777,'timeFormat' => '%X %x');

  /**
   * Set the referer sent with the first request.
   *
   * @param string $referer
   */
  public function setReferer($referer){
    $this->referer = $referer;
  }

  /**
   * Set how many pages the crawler reads before it stops.
   *
   * @param int $pages
   */
  public function setPages($pages){
    $this->page_counter_for_user = $pages;
  }

  /**
   * Set the directory the images are downloaded to.
   * A trailing slash is appended when missing; an empty path keeps the
   * default (current directory) instead of turning into "/".
   *
   * @param string $path
   */
  public function setDownloadPath($path){
    $path = (string)$path;
    if($path === "" || substr($path,-1) === "/"){
      $this->download_path = $path;
    }else{
      $this->download_path = $path."/";
    }
  }

  /**
   * Set a custom User-Agent header.
   *
   * @param string $ua
   */
  public function setUserAgent($ua){
    $this->user_agent = $ua;
  }

  /**
   * Set the number of seconds to pause after each batch of pages.
   *
   * @param int $seconds
   */
  public function setTimerInOneCrawl($seconds){
    $this->timer_in_one_crawl = $seconds;
  }

  /**
   * Set the number of pages scanned between two pauses.
   *
   * @param int $pages
   */
  public function setPagesInOneCrawl($pages){
    $this->pages_in_one_crawl = $pages;
  }

  /**
   * Apply referer and User-Agent to the htmlSQL object and throttle the crawl.
   *
   * On the very first page the user-configured referer is used and $url is
   * remembered as already visited; on later pages $referer is used.
   *
   * @param object      $obj_htmlsql htmlSQL object about to connect
   * @param string      $url         URL about to be requested
   * @param string|null $referer     URL of the page that linked to $url
   */
  public function setInfoOfCrawler($obj_htmlsql,$url,$referer = null){
    if($this->page_counter_for_crawler == 0){
      $obj_htmlsql->set_referer($this->referer);
      $this->doc_link_of_memory[] = $url;
    }else{
      $obj_htmlsql->set_referer($referer);
    }

    if($this->user_agent){
      $obj_htmlsql->set_user_agent($this->user_agent);
    }
    $this->regurateCrawlSpeed();
  }

  /**
   * Pause after every batch of pages so the target server is not overloaded.
   * Does nothing before the first page or when the batch size is not positive
   * (which would otherwise be a modulo by zero).
   */
  public function regurateCrawlSpeed(){
    $batch = (int)$this->pages_in_one_crawl;
    if($batch <= 0 || $this->page_counter_for_crawler == 0){
      return;
    }
    if($this->page_counter_for_crawler % $batch == 0){
      sleep(max(0,(int)$this->timer_in_one_crawl));
    }
  }

  /**
   * Send the HTTP request for $url and count the page as scanned.
   *
   * @param string      $url
   * @param string|null $referer
   * @throws HtmlsqlException when htmlSQL cannot connect to $url
   */
  public function connect($url,$referer = null){
    $this->obj_htmlsql = new htmlsql();
    $this->setInfoOfCrawler($this->obj_htmlsql,$url,$referer);

    if(!$this->obj_htmlsql->connect('url',$url)){
      throw new HtmlsqlException('Error while connecting:'. $this->obj_htmlsql->error);
    }

    $this->getLogger()->log("Page Number「".$this->page_counter_for_crawler."」,Url「".$url."」Now Scanning",PEAR_LOG_INFO);
    $this->page_counter_for_crawler ++;
  }

  /**
   * Collect the absolute URLs of the images of the connected page that have
   * not been returned before.
   *
   * @param string $url URL of the connected page (base for relative links)
   * @return array list of absolute image URLs (empty when there is none)
   * @throws HtmlsqlException when the htmlSQL query fails
   */
  public function getImgTag($url){
    if(!$this->obj_htmlsql->query('SELECT src FROM img')){
      throw new HtmlsqlException('Query error: '.$this->obj_htmlsql->error);
    }
    $result = array();
    // Instance call: works whether or not LinkBuilder declares its methods static.
    $link_builder = new LinkBuilder();
    foreach($this->obj_htmlsql->fetch_array() as $img_url){
      if(!isset($img_url['src']) || strpos($img_url['src'],"#") !== false){
        continue;
      }
      $img_tag = $link_builder->getAbsolutePath($img_url['src'],$url);
      if(!$img_tag){
        continue;
      }
      if(!in_array($img_tag,$this->img_link_of_memory,true)){
        $result[] = $img_tag;
        // Remember the image URL itself (not the page URL) so duplicates are skipped.
        $this->img_link_of_memory[] = $img_tag;
      }
    }
    return $result;
  }

  /**
   * Collect the same-site document links of the connected page that have not
   * been queued before. Rejected links are logged and skipped.
   *
   * @param string $url URL of the connected page (base for relative links)
   * @return array list of absolute document URLs (empty when there is none)
   * @throws HtmlsqlException when the htmlSQL query fails
   */
  public function getAnchorTag($url){
    // inspect the links of the page
    if(!$this->obj_htmlsql->query('SELECT href From a')){
      throw new HtmlsqlException('Query error: '.$this->obj_htmlsql->error);
    }
    $result = array();
    $link_builder = new LinkBuilder();
    foreach($this->obj_htmlsql->fetch_array() as $row){
      $link = isset($row['href']) ? $row['href'] : "";
      try{
        $absolute_path = $link_builder->getAbsolutePath($link,$url);
        $inside_link = $link_builder->obtainInsideLink($absolute_path,$url);

        if(in_array($inside_link,$this->doc_link_of_memory,true)){
          throw new ScanningException("[anchor error]url「".$inside_link." is scannned already.");
        }

        $result[] = $inside_link;
        $this->doc_link_of_memory[] = $inside_link;

      }catch(ScanningException $e){
        // Log only the message; casting the exception to string would dump the stack trace.
        $this->getLogger()->log($e->getMessage()." (found on ".$url.")",PEAR_LOG_INFO);
      }
    }
    return $result;
  }

  /**
   * Scan $url, download its images, then scan every new same-site link found
   * on it (depth first) until the page limit is reached.
   *
   * @param string      $url     page to scan
   * @param string|null $referer page that linked to $url
   */
  public function recursiveScan($url,$referer = null){
    if($this->page_counter_for_crawler >= $this->page_counter_for_user){
      $this->flag_finish = true;
    }
    if($this->flag_finish){
      return;
    }

    $anchor_parse_array = array();
    try{
      $this->connect($url,$referer);
      $img_parse_array = $this->getImgTag($url);
      $downloader = new Downloader();
      $downloader->downloadImgFile($img_parse_array,$this->download_path);
      $anchor_parse_array = $this->getAnchorTag($url);
    }catch(HtmlsqlException $e){
      $this->getLogger()->log('cannot access '.$url,PEAR_LOG_INFO);
    }

    foreach($anchor_parse_array as $link){
      $this->recursiveScan($link,$url);
    }
  }

  /**
   * Return the shared PEAR Log instance writing to out.log.
   *
   * @return object
   */
  private function getLogger(){
    return Log::singleton('file', 'out.log',$this->conf);
  }
}

5 downloader.class.php 画像ファイルをダウンロードする機能を持ったクラス

/**
 * Saves image files referenced by URL into a local directory.
 * All methods are static; they are still callable on an instance.
 */
class Downloader{

  /**
   * Derive the local file name from an image URL.
   *
   * The name is the last segment of the URL path, so a query string or
   * fragment is not part of it. The name is not URL-decoded.
   *
   * @param string $img_url            image URL
   * @param string $path_to_local_file download directory (currently unused)
   * @return string file name, or '' when the URL has no usable last segment
   */
  public static function nameImgFile($img_url,$path_to_local_file){
    $path = parse_url((string)$img_url,PHP_URL_PATH);
    if(!is_string($path)){
      return "";
    }
    $slash = strrpos($path,"/");
    $file_name = ($slash === false) ? $path : (string)substr($path,$slash + 1);
    // "." and ".." would point at a directory, never at a file.
    if($file_name === "." || $file_name === ".."){
      return "";
    }
    return $file_name;
  }

  /**
   * Fetch one image and write it to $download_path.$file_name.
   *
   * Best effort: a failed fetch, an empty body or an unwritable target is
   * skipped without raising an error.
   *
   * @param string $img_url       image URL (anything file_get_contents() can read)
   * @param string $file_name     local file name
   * @param string $download_path directory ending with "/", or '' for the current directory
   * @return bool true when the file was written
   */
  public static function makeImgFile($img_url,$file_name,$download_path){
    if((string)$file_name === ""){
      return false;
    }
    // Plain concatenation: sprintf() would treat "%" in URL-encoded names as a format specifier.
    $local_img = $download_path.$file_name;
    $img = @file_get_contents($img_url);
    if($img === false || $img === ""){
      return false;
    }
    $img_rsc = @fopen($local_img,"wb");
    if($img_rsc === false){
      return false;
    }
    // Locking is advisory; the data is written even when the lock is unavailable.
    $locked = flock($img_rsc,LOCK_EX);
    $written = fwrite($img_rsc,$img) !== false;
    if($locked){
      flock($img_rsc,LOCK_UN);
    }
    fclose($img_rsc);
    return $written;
  }

  /**
   * Download every image of the list into $download_path, creating the
   * directory when it does not exist yet.
   *
   * @param array|null  $img_source_array list of image URLs
   * @param string|null $download_path    directory ending with "/", or '' / null for the current directory
   */
  public static function downloadImgFile($img_source_array,$download_path){
    $download_path = (string)$download_path;
    if($download_path !== "" && !is_dir($download_path)){
      @mkdir($download_path,0755,true);
    }
    if(!is_array($img_source_array)){
      return;
    }
    foreach($img_source_array as $img_url){
      $file_name = self::nameImgFile($img_url,$download_path);
      self::makeImgFile($img_url,$file_name,$download_path);
    }
  }
}

mind-tech

主にテクノロジーや啓発系、副業などのテーマ中心

画像ぶっこ抜くスクリプト書いてみた