對 wget 這個 tool 不熟,平常使用 wget 下載一些資料時,可以輕易地使用 --referer 來偽造 HTTP Header 資料,因此能夠通過對方 Server 檢查
wget --referer="REFERER_URL" "TARGET_URL"
然而,上述的 REFERER_URL 和 TARGET_URL 都是固定的位置,如果它們會根據 session / cookie 而改變的話,不曉得還有沒有辦法?對我而言,寫 PHP 比去看 manpage 來得快 XD 所以我就寫成 PHP 囉!或許 wget 也有更方便的下法吧,改天再努力看 manpage
程式碼:
<?php
// Two-step download: fetch SOURCE_URL, extract the (session dependent) target
// url from its HTML, then download that target while sending SOURCE_URL as the
// Referer. Both requests go through the same cURL handle and cookie file.
$output_file = 'result.file'; // file that receives the downloaded payload
$cookie_file = 'cookie.tmp'; // cookie file read and written by cURL
$source_url = 'SOURCE_URL'; // page to scrape; reused as REFERER_URL afterwards
$pattern = '/class="download" href="(.*?)"/'; // sample pattern used to extract TARGET_URL

$ch = curl_init();
curl_setopt( $ch , CURLOPT_URL , $source_url );
curl_setopt( $ch , CURLOPT_COOKIEFILE , $cookie_file );
curl_setopt( $ch , CURLOPT_COOKIEJAR , $cookie_file );
curl_setopt( $ch , CURLOPT_RETURNTRANSFER , true );

$result = curl_exec( $ch );
if( $result === false )
{
    // Without this check a network failure would look like "nothing matched".
    echo "ERROR: Cannot fetch the source url:$source_url (".curl_error( $ch ).")\n";
}
else if( preg_match_all( $pattern , $result , $match ) && isset( $match[1][1] ) )
{
    // $match[1] holds the first capture group; index 1 is its second hit.
    // Adjust both indexes to suit the pattern in use.
    $target_url = $match[1][1];
    $referer_url = $source_url;

    $fp = fopen( $output_file , 'wb' );
    if( $fp === false )
    {
        echo "ERROR: Cannot open the output file to write:$output_file\n";
    }
    else
    {
        // Options already set on the handle (cookie file / jar) stay in effect,
        // so only the url, the referer and the output sink change here.
        curl_setopt( $ch , CURLOPT_URL , $target_url );
        curl_setopt( $ch , CURLOPT_REFERER , $referer_url );
        curl_setopt( $ch , CURLOPT_RETURNTRANSFER , false );
        curl_setopt( $ch , CURLOPT_FILE , $fp );

        echo "GO...\n";
        $ok = curl_exec( $ch );
        fclose( $fp );
        if( $ok === false )
            echo "ERROR: Download failed:".curl_error( $ch )."\n";
        else
            echo "Finish..\n";
    }
}
else
{
    echo "ERROR: No target url found in the source page\n";
}
curl_close( $ch );
?>
以上是要從 SOURCE_URL 上頭, 找到下載位置(target_url), 然而, 那個位置卻每次都不一樣, 最重要的是跟 session 有關係並且下載 target_url 時還必須奉上 cookie 資訊, 所以, 先收集一下 cookie 囉!(上述程式並不謹慎, 例如儲存結果的檔案有可能開檔失敗)
後記,無聊又改寫成 tool mode:
<?php
// Command line option table, keyed by the getopt() short-option spec
// (a trailing ':' means the option takes an argument).
//   'value' - default value; NULL marks an option the user must supply
//   'text'  - line shown for the option in the help / "Must Set" output
$shortopt = array(
    'h' => array(
        'value' => '',
        'text' => '-h, help',
    ),
    'c:' => array(
        'value' => '',
        'text' => "-c '/tmp/cookie_file' , tmp file for cookie",
    ),
    'o:' => array(
        'value' => '',
        'text' => "-o '/tmp/output_file' , path for result file. default use stdout",
    ),
    'u:' => array(
        'value' => null,
        'text' => "-u 'http://www.google.com' , source url",
    ),
    'e:' => array(
        'value' => null,
        'text' => "-e '/class=\"normal-down\" href=\"(.*?)\"/is' , regexp pattern for extract the target url",
    ),
    'm:' => array(
        'value' => '',
        'text' => "-m '1,1' , choose the result matched to be used. e.g. use the match[5][2] is '5,2'",
    ),
    // Defaults to the string 'true'; the option parser flips it when -d is given.
    'd' => array(
        'value' => 'true',
        'text' => "-d , disable test mode for showing the target matched by regexp pattern",
    ),
);
// getopt() is required for everything below; bail out early when it is missing.
if( !function_exists( 'getopt' ) )
{
    echo "'getopt' is not supported in current PHP version.\n";
    exit( 1 );
}

// Build the getopt() short-option string and the help text in a single pass
// over the option table.
$shortopt_list = '';
$shortopt_help = '';
foreach( $shortopt as $opt_spec => $opt_def )
{
    $shortopt_list .= $opt_spec;
    $shortopt_help .= "\t".$opt_def['text']."\n";
}

// Parse the command line. getopt() may return false instead of an array;
// treat that as "no options given" so the loops below stay valid.
$parse_arg = getopt( $shortopt_list );
if( !is_array( $parse_arg ) )
    $parse_arg = array();

// -h: print the help menu and stop.
if( isset( $parse_arg['h'] ) )
{
    echo "Usage> php ".$argv[0]." -h\n";
    echo $shortopt_help;
    exit( 0 );
}

// Copy the parsed options into the option table.
foreach( $parse_arg as $opt_name => $opt_given )
{
    if( isset( $shortopt[$opt_name] ) )
    {
        // Flag without argument: the stored value becomes true only when the
        // default was the string 'false', otherwise false. For -d (default
        // 'true') this means "given" => false, i.e. test mode switched off.
        $shortopt[$opt_name]['value'] = ( strcasecmp( $shortopt[$opt_name]['value'] , 'false' ) === 0 );
    }
    else if( isset( $shortopt[$opt_name.':'] ) )
    {
        // An option repeated on the command line arrives as an array;
        // keep the last occurrence so later code always sees one string.
        $shortopt[$opt_name.':']['value'] = is_array( $opt_given ) ? end( $opt_given ) : $opt_given;
    }
}

// Options whose value is still NULL are mandatory and were not supplied.
$check_out = '';
foreach( $shortopt as $opt_def )
{
    if( !isset( $opt_def['value'] ) )
        $check_out .= "\t".$opt_def['text']."\n";
}
if( $check_out !== '' )
{
    echo "Usage> php ".$argv[0]." -h\n";
    echo "Must Set:\n$check_out\n";
    exit( 1 );
}
// ---- Resolve the runtime settings from the option table --------------------
// '' means "not given". Comparing against '' (rather than empty()) keeps a
// file that is literally named '0' usable.
$cookie_file = ( $shortopt['c:']['value'] !== '' ) ? $shortopt['c:']['value'] : NULL;
$output_file = ( $shortopt['o:']['value'] !== '' ) ? $shortopt['o:']['value'] : NULL;
$source_url = $shortopt['u:']['value'];
$regexp_pattern = $shortopt['e:']['value'];

// -m 'a,b' selects $matches[a][b]. Every index is trimmed so that '1, 1'
// behaves like '1,1'.
if( !empty( $shortopt['m:']['value'] ) )
    $shortopt['m:']['value'] = trim( $shortopt['m:']['value'] );
$choose_match = !empty( $shortopt['m:']['value'] )
    ? array_map( 'trim' , explode( ',' , $shortopt['m:']['value'] ) )
    : NULL;

// Test mode only lists the matches. It is active unless a selection was made
// AND -d switched the 'd' value to false.
$test_mode = empty( $choose_match ) || $shortopt['d']['value'];

$exit_code = 0;

// ---- Step 1: fetch the source page into memory ------------------------------
$ch = curl_init();
curl_setopt( $ch , CURLOPT_URL , $source_url );
if( $cookie_file !== NULL )
{
    curl_setopt( $ch , CURLOPT_COOKIEFILE , $cookie_file );
    curl_setopt( $ch , CURLOPT_COOKIEJAR , $cookie_file );
}
curl_setopt( $ch , CURLOPT_RETURNTRANSFER , true );
$result = curl_exec( $ch );

if( $result === false )
{
    echo "ERROR: Cannot fetch the source url:$source_url (".curl_error( $ch ).")\n";
    $exit_code = 1;
}
else
{
    // ---- Step 2: extract the candidate target urls --------------------------
    $match_count = preg_match_all( $regexp_pattern , $result , $matches );
    if( $match_count === false )
    {
        echo "ERROR: Cannot apply the regexp pattern:$regexp_pattern\n";
        $exit_code = 1;
    }
    else if( $match_count === 0 )
    {
        echo "ERROR: The regexp pattern matched nothing:$regexp_pattern\n";
        $exit_code = 1;
    }
    else
    {
        $target_url = getTargetURL( $matches , $choose_match );
        if( $test_mode || empty( $target_url ) || !is_string( $target_url ) )
        {
            // Show everything that matched so the user can pick a -m value.
            echo "Matched Target URL: \n";
            print_r( $matches );
            echo "Choose option(Cannot be empty):".$shortopt['m:']['value']."\n";
            echo "Target(Cannot be empty):".( is_string( $target_url ) ? $target_url : '' )."\n";
        }
        else
        {
            // ---- Step 3: download the target, sending the source as Referer --
            // Cookie options set in step 1 are still active on this handle.
            curl_setopt( $ch , CURLOPT_URL , $target_url );
            curl_setopt( $ch , CURLOPT_REFERER , $source_url );
            // Step 1 asked cURL to return the body as a string. Switch that off,
            // otherwise the stdout branch below would fetch the data and then
            // silently throw it away.
            curl_setopt( $ch , CURLOPT_RETURNTRANSFER , false );

            if( $output_file !== NULL )
            {
                echo "Target URL:$target_url\n";
                echo "Referer URL:$source_url\n";
                $fp = fopen( $output_file , 'wb' );
                if( $fp === false )
                {
                    echo "ERROR: Cannot open the output file to write:$output_file\n";
                    $exit_code = 1;
                }
                else
                {
                    curl_setopt( $ch , CURLOPT_FILE , $fp );
                    echo "Begin...\n";
                    $ok = curl_exec( $ch );
                    fclose( $fp );
                    if( $ok === false )
                    {
                        echo "ERROR: Download failed:".curl_error( $ch )."\n";
                        $exit_code = 1;
                    }
                    else
                    {
                        echo "...Finish\n";
                    }
                }
            }
            else
            {
                // No -o given: cURL writes the body straight to stdout.
                if( curl_exec( $ch ) === false )
                {
                    echo "ERROR: Download failed:".curl_error( $ch )."\n";
                    $exit_code = 1;
                }
            }
        }
    }
}

// Single exit point so the handle is always closed.
curl_close( $ch );
exit( $exit_code );
/**
 * Pick one entry out of a (nested) preg_match_all() result.
 *
 * @param mixed $matches Nested match array, or a value already reached by
 *                       walking into it.
 * @param mixed $choose  Either a list of indexes to follow one level at a time
 *                       (e.g. array( '5', '2' ) selects $matches[5][2]) or a
 *                       single int/string index.
 * @return mixed The selected value, or NULL when nothing can be selected:
 *               $matches is NULL, an index does not exist, or the index list
 *               ends while the current value is still an array.
 */
function getTargetURL( $matches , $choose )
{
    if( !isset( $matches ) )
        return NULL;

    // Reached a non-array value: return it; any remaining indexes are ignored.
    if( !is_array( $matches ) )
        return $matches;

    if( is_array( $choose ) )
    {
        // Out of indexes but still looking at an array: no single value was
        // selected. (Using the empty list itself as an array offset is illegal.)
        if( count( $choose ) === 0 )
            return NULL;

        $index = array_shift( $choose );
        if( !is_int( $index ) && !is_string( $index ) )
            return NULL;
        if( isset( $matches[ $index ] ) )
            return getTargetURL( $matches[ $index ] , $choose );
        return NULL;
    }

    // Single index form: one direct lookup, no recursion.
    if( ( is_int( $choose ) || is_string( $choose ) ) && isset( $matches[ $choose ] ) )
        return $matches[ $choose ];

    return NULL;
}
?>
用法:
單純以抓 Yahoo! News 為例
尚未指定 -m
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is'
Matched Target URL:
Array
(
[0] => Array
(
[0] => <h3><a href="news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html" title="莫拉克風災 學者:無關暖化"
[1] => <h3><a href="news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html" title="立院藏七寶 總價數億元"
)
[1] => Array
(
[0] => news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html
[1] => news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html
)
[2] => Array
(
[0] => 莫拉克風災 學者:無關暖化
[1] => 立院藏七寶 總價數億元
)
)
Choose option(Cannot be empty):
Target(Cannot be empty):
指定 -m '1,1'
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1'
Matched Target URL:
Array
(
[0] => Array
(
[0] => <h3><a href="news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html" title="莫拉克風災 學者:無關暖化"
[1] => <h3><a href="news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html" title="立院藏七寶 總價數億元"
)
[1] => Array
(
[0] => news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html
[1] => news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html
)
[2] => Array
(
[0] => 莫拉克風災 學者:無關暖化
[1] => 立院藏七寶 總價數億元
)
)
Choose option(Cannot be empty):1,1
Target(Cannot be empty):news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html
正式要下載請記得加 -d (disable test) , 但此例不適用, 因為抓出來的 url 並不完整, 開頭只是 "news/a/h2/t/*....."
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1' -d
輸出到檔案
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1' -d -o '/tmp/output'
需要 cookie
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1' -d -o '/tmp/output' -c '/tmp/cookie'