问题描述
|
我正在做一个图像搜索项目,想要获取一个页面中最大的那张图像。
我写了一些代码,把图片地址转换为绝对地址,并剔除可能是广告的图片,然后比较各图片的宽×高,输出(echo)面积最大的那一张。但是我的代码有问题。下面是我的全部代码。谁能帮我修复出错的地方,并告诉我如何优化代码?我觉得整个过程很慢。谢谢大家。
<?PHP
// Question script: loads one page, walks every img element, and echoes an img tag
// each time it finds a new largest image (by width * height) whose URL does not
// contain one of the ad keywords.
// NOTE(review): the backslash-escaped quotes in this listing look like a publishing
// artifact; confirm against the original post before running it.
require_once \'simple_html_dom.PHP\';
require \'url_to_absolute.PHP\'; //get image absolute url
$v = \'http://www.yomiuri.co.jp/stream/\';
$html = file_get_html($v);
// Largest area seen so far and the URL it belongs to.
$maxsize = -1;
$the_biggest_image = false;
$arr = array(\'ad\',\'ads\',\'gif\');// add ads possible words as a arry which is check in the image url
foreach($html->find(\'img\') as $element) {
// Loop-invariant: $v never changes, yet the base URL is re-extracted on every iteration.
preg_match_all(\'#https?://(.*?)($|/)#m\',urldecode(stripcslashes($v)),$r); //get site base url
$pic = $element->src;
$comm = url_to_absolute( $r[0][0],$pic);//get image absolute url
// Keyword filter: any case-insensitive substring hit clears the flag.
$check_flag = true;
foreach($arr as $item) {
if (substr_count(strtolower($comm),$item) > 0) $check_flag = false;
}// remove ads images
// NOTE(review): this assignment overwrites $arr, the keyword list iterated above, with
// the getimagesize() result. When $check_flag is false, $arr keeps its previous value
// and the comparison below still runs against it.
if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
// NOTE(review): $comm is the return value of url_to_absolute() (a string or FALSE),
// while reset() operates on arrays.
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {
$maxsize = $arr[0] * $arr[1]; //compare images\' sise
$the_biggest_image = $comm;
// The echo sits inside the if, so every new maximum is printed, not only the final one.
echo \'<img src=\"\'.$the_biggest_image.\'\" />\'; //echo the biggest one
}
}
?>
url_to_absolute.PHP
<?PHP
/**
* Edited by Nitin Kr. Gupta,publicmind.in
*/
/**
* copyright (c) 2008,David R. Nadeau,NadeauSoftware.com.
* All rights reserved.
*
* Redistribution and use in source and binary forms,with or without
* modification,are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice,this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice,this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* * Neither the names of David R. Nadeau or NadeauSoftware.com,nor
* the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
* OF SUCH DAMAGE.
*/
/*
* This is a BSD License approved by the Open Source Initiative (OSI).
* See: http://www.opensource.org/licenses/bsd-license.PHP
*/
/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL. The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  baseUrl      the absolute base URL.
 *
 *  relativeUrl  the relative URL to convert.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs, or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
    // If relative URL has a scheme, clean path and return.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    if ( !is_array( $r ) )
        $r = array( );      // nothing recognised (e.g. empty string): start from no components
    if ( !empty( $r['scheme'] ) )
    {
        if ( !empty( $r['path'] ) && $r['path'][0] === '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( !is_array( $b ) || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( !empty( $r['path'] ) )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path (and base query when
    // the relative URL did not bring its own).
    if ( empty( $r['path'] ) )
    {
        if ( !empty( $b['path'] ) )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path:
    // keep everything of the base path before its last "/" and append
    // the relative path. A base URL without any path (for example
    // "http://example.com") contributes an empty prefix.
    if ( $r['path'][0] !== '/' )
    {
        $basePath = isset( $b['path'] ) ? $b['path'] : '';
        // "/" is a single ASCII byte that never occurs inside a multi-byte
        // UTF-8 sequence, so byte-wise searching is UTF-8 safe here.
        $lastSlash = strrpos( $basePath, '/' );
        $base = ( $lastSlash === FALSE ) ? '' : substr( $basePath, 0, $lastSlash );
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}
/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with "." and ".." removed. Empty segments
 *  (doubled slashes) are collapsed as well. A leading "/" is kept, and
 *  a trailing "/" is kept or added when the input ends in "/", "/." or
 *  "/..", since those all refer to a directory.
 */
function url_remove_dot_segments( $path )
{
    $path = (string) $path;
    if ( $path === '' )
        return '';

    // "/" is a single ASCII byte that never appears inside a multi-byte
    // UTF-8 sequence, so a plain byte-wise split is UTF-8 safe and also
    // tolerates malformed input.
    $inSegs  = explode( '/', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg === '' || $seg === '.' )
            continue;
        if ( $seg === '..' )
            array_pop( $outSegs );      // popping an empty stack is a no-op: cannot climb above the root
        else
            $outSegs[] = $seg;
    }

    $outPath = implode( '/', $outSegs );
    if ( $path[0] === '/' )
        $outPath = '/' . $outPath;

    // Decide whether the result needs a trailing slash.
    $lastSeg       = $inSegs[count( $inSegs ) - 1];
    $endsWithSlash = ( $lastSeg === '' );
    $endsWithDot   = ( $lastSeg === '.' || $lastSeg === '..' );
    if ( $outPath !== '/' &&
        ( $endsWithSlash || ( $endsWithDot && $outPath !== '' ) ) )
        $outPath .= '/';
    return $outPath;
}
/**
* This function parses an absolute or relative URL and splits it
* into individual components.
*
* RFC3986 specifies the components of a Uniform Resource Identifier (URI).
* A portion of the ABNFs are repeated here:
*
* URI-reference = URI
* / relative-ref
*
* URI = scheme \":\" hier-part [ \"?\" query ] [ \"#\" fragment ]
*
* relative-ref = relative-part [ \"?\" query ] [ \"#\" fragment ]
*
* hier-part = \"//\" authority path-abempty
* / path-absolute
* / path-rootless
* / path-empty
*
* relative-part = \"//\" authority path-abempty
* / path-absolute
* / path-noscheme
* / path-empty
*
* authority = [ userinfo \"@\" ] host [ \":\" port ]
*
* So,a URL has the following major components:
*
* scheme
* The name of a method used to interpret the rest of
* the URL. Examples: "http", "https", "mailto", "file".
*
* authority
* The name of the authority governing the URL\'s name
* space. Examples: "example.com", "user@example.com",
* "example.com:80", "user:password@example.com:80".
*
* The authority may include a host name, port number,
* user name, and password.
*
* The host may be a name,an IPv4 numeric address,or
* an IPv6 numeric address.
*
* path
* The hierarchical path to the URL\'s resource.
* Examples: \"/index.htm\",\"/scripts/page.PHP\".
*
* query
* The data for a query. Examples: \"?search=google.com\".
*
* fragment
* The name of a secondary resource relative to that named
* by the path. Examples: \"#section1\",\"#header\".
*
* An "absolute" URL must include a scheme and path. The authority, query,
* and fragment components are optional.
*
* A "relative" URL does not include a scheme and must include a path. The
* authority, query, and fragment components are optional.
*
* This function splits the $url argument into the following components
* and returns them in an associative array. Keys to that array include:
*
* \"scheme\" The scheme,such as \"http\".
* \"host\" The host name,IPv4,or IPv6 address.
* \"port\" The port number.
* \"user\" The user name.
* \"pass\" The user password.
* \"path\" The path,such as a file path for \"http\".
* \"query\" The query.
* \"fragment\" The fragment.
*
* One or more of these may not be present,depending upon the URL.
*
* Optionally, the "user", "pass", "host" (if a name, not an IP address),
* "path", "query", and "fragment" may have percent-encoded characters
* decoded. The \"scheme\" and \"port\" cannot include percent-encoded
* characters and are never decoded. Decoding occurs after the URL has
* been parsed.
*
* Parameters:
* url the URL to parse.
*
* decode an optional boolean flag selecting whether
* to decode percent encoding or not. Default = FALSE.
*
* Return values:
* the associative array of URL parts,or FALSE if the URL is
* too malformed to recognize any parts.
*/
function split_url( $url, $decode = FALSE )
{
    // The pattern below is assembled from small pieces. Every "(" opens a
    // numbered capture group, and the $m[...] indexes further down depend
    // on that numbering, so do not add or remove groups without
    // recounting. Hyphens inside character classes are escaped on purpose:
    // PCRE2 rejects an unescaped "-" that directly follows \d.

    // Character sets from RFC3986.
    $xunressub = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar    = $xunressub . ':@% ';

    // Scheme from RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
    $xscheme   = '([a-zA-Z][a-zA-Z\d+\-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo = '(([' . $xunressub . '%]*)' .
                 '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4     = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6     = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035. Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    $xhost_name = '([a-zA-Z\d\-.%]+)';

    // Authority from RFC3986. Skip IP future.
    $xhost      = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport      = '(\d*)';
    $xauthority = '((' . $xuserinfo . '@)?' . $xhost .
                  '?(:' . $xport . ')?)';

    // Path from RFC3986. Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                     '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl = '^(' . $xscheme . ':)?' . $xapath . '?' .
            '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', (string) $url, $m ) )
        return FALSE;

    // Always hand back an array, even when no component was recognised
    // (for example for an empty string).
    $parts = array( );

    // Group 2: scheme.
    if ( !empty( $m[2] ) )
        $parts['scheme'] = strtolower( $m[2] );

    // Group 7: "userinfo@", 9: user, 10: ":pass", 11: pass.
    if ( !empty( $m[7] ) )
        $parts['user'] = isset( $m[9] ) ? $m[9] : '';
    if ( !empty( $m[10] ) )
        $parts['pass'] = $m[11];

    // Group 13: host name, 14: IPv4, 16: IPv6 without brackets,
    // 5: "//authority" present but with an empty host.
    $hostIsName = FALSE;
    if ( !empty( $m[13] ) )
    {
        $parts['host'] = $m[13];
        $hostIsName    = TRUE;      // only host names may carry percent-encoding
    }
    else if ( !empty( $m[14] ) )
        $parts['host'] = $m[14];
    else if ( !empty( $m[16] ) )
        $parts['host'] = $m[16];
    else if ( !empty( $m[5] ) )
        $parts['host'] = '';

    // Group 17: ":port", 18: port.
    if ( !empty( $m[17] ) )
        $parts['port'] = $m[18];

    // Group 19: path after an authority, 21: absolute path,
    // 25: relative path.
    if ( !empty( $m[19] ) )
        $parts['path'] = $m[19];
    else if ( !empty( $m[21] ) )
        $parts['path'] = $m[21];
    else if ( !empty( $m[25] ) )
        $parts['path'] = $m[25];

    // Group 27: "?query", 28: query, 29: "#fragment", 30: fragment.
    if ( !empty( $m[27] ) )
        $parts['query'] = $m[28];
    if ( !empty( $m[29] ) )
        $parts['fragment'] = $m[30];

    if ( !$decode )
        return $parts;

    // Percent-decode the components that may legally be encoded.
    // "scheme" and "port" are never decoded.
    foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
    {
        if ( !empty( $parts[$key] ) )
            $parts[$key] = rawurldecode( $parts[$key] );
    }
    if ( $hostIsName )
        $parts['host'] = rawurldecode( $parts['host'] );
    return $parts;
}
/**
* This function joins together URL components to form a complete URL.
*
* RFC3986 specifies the components of a Uniform Resource Identifier (URI).
* This function implements the specification\'s \"component recomposition\"
* algorithm for combining URI components into a full URI string.
*
* The $parts argument is an associative array containing zero or
* more of the following:
*
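* "scheme" The scheme, such as "http".
* "host" The host name, IPv4, or IPv6 address.
* "port" The port number.
* "user" The user name.
* "pass" The user password.
* "path" The path, such as a file path for "http".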
* \"query\" The query.
* \"fragment\" The fragment.
*
* The \"port\",\"user\",and \"pass\" values are only used when a \"host\"
* is present.
*
* The optional $encode argument indicates if appropriate URL components
* should be percent-encoded as they are assembled into the URL. Encoding
* is only applied to the \"user\",\"host\" (if a host name,not an
* IP address),\"path\",and \"fragment\" components. The \"scheme\"
* and \"port\" are never encoded. When a \"scheme\" and \"host\" are both
* present,the \"path\" is presumed to be hierarchical and encoding
* processes each segment of the hierarchy separately (i.e.,the slashes
* are left alone).
*
* The assembled URL string is returned.
*
* Parameters:
* parts an associative array of strings containing the
* individual parts of a URL.
*
* encode an optional boolean flag selecting whether
* to do percent encoding or not. Default = FALSE.
*
* Return values:
* Returns the assembled URL string. The string is an absolute
* URL if a scheme is supplied,and a relative URL if not. An
* empty string is returned if the $parts array does not contain
* any of the needed values.
*/
function join_url( $parts, $encode = FALSE )
{
    // Optionally percent-encode the individual components first.
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user'] = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass'] = rawurlencode( $parts['pass'] );
        // Leave IP literals alone (bracketed IPv6, or a bare run of
        // hex digits, dots and colons); only host names are encoded.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host'] = rawurlencode( $parts['host'] );
        // Encode the path as a whole, then restore the segment
        // separators so the hierarchy stays intact.
        if ( !empty( $parts['path'] ) )
            $parts['path'] = str_replace( '%2F', '/', rawurlencode( $parts['path'] ) );
        if ( isset( $parts['query'] ) )
            $parts['query'] = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    // Component recomposition: scheme ":" "//" authority path "?" query "#" fragment.
    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']';     // IPv6
        else
            $url .= $parts['host'];                 // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        // A path that follows an authority must start with a slash.
        if ( !empty( $parts['path'] ) && $parts['path'][0] !== '/' )
            $url .= '/';
    }
    if ( !empty( $parts['path'] ) )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}
/**
 * This function encodes URL to form a URL which is properly
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the
 * disallowed characters in the URL with their respective percent
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string.
 */
function encode_url($url) {
    // Percent-encoded form => reserved character it stands for.
    // "%25" MUST stay last: turning it back into "%" is what restores
    // escapes that were already present in the input ("%20" becomes
    // "%2520" after rawurlencode() and "%20" again here). Doing it
    // earlier would let the restored "%" combine with the following hex
    // digits and be decoded a second time.
    $reserved = array(
        '%3A' => ':', '%2F' => '/', '%3F' => '?', '%23' => '#',
        '%5B' => '[', '%5D' => ']', '%40' => '@', '%21' => '!',
        '%24' => '$', '%26' => '&', '%27' => "'", '%28' => '(',
        '%29' => ')', '%2A' => '*', '%2B' => '+', '%2C' => ',',
        '%3B' => ';', '%3D' => '=', '%25' => '%',
    );
    // Encode everything, then put the reserved characters back.
    // rawurlencode() always emits upper-case hex digits, so a
    // case-sensitive replacement is sufficient; str_replace() applies the
    // pairs one after another in array order.
    $url = rawurlencode((string) $url);
    return str_replace(array_keys($reserved), array_values($reserved), $url);
}
?>
解决方法
您尚未真正说出您遇到了什么错误,但是幸运的是,您的代码中有几个错误。可能会给您带来错误的代码在此块中:
if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {
$maxsize = $arr[0] * $arr[1]; //compare images\' sise
$the_biggest_image = $comm;
echo \'<img src=\"\'.$the_biggest_image.\'\" />\'; //echo the biggest one
}
您覆盖了 $arr,而它正是您用来过滤广告(“ads”)的关键词数组。
即使 $check_flag 为 false,您仍然会执行后面的尺寸比较语句。
reset() 不适用于字符串。
每当您更新最大尺寸时,您都会回显 $the_biggest_image。那是故意的吗?
更新
尝试使您的代码正常工作,并希望稍微好一些:
<?php
// Finds the largest image (by width * height) on one page, skipping images whose
// URL looks like an ad, and prints a single <img> tag for it.
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; // provides url_to_absolute()

// --- options ---
$url = 'http://www.yomiuri.co.jp/stream/';
// Case-insensitive substrings; an image whose absolute URL contains any of them is skipped.
$ignore = array('ad', 'ads', 'gif');
// Shown when no usable image is found.
$biggestImage = 'path to "no image found" image';

// --- process ---
$maxSize = -1;
$visited = array(); // absolute URL => true, so every image is downloaded at most once
$html = file_get_html($url);
// file_get_html() returns false when the page cannot be loaded; treat that as "no images".
$images = $html ? $html->find('img') : array();

foreach ($images as $element) {
    $pic = trim((string) $element->src);
    if ($pic === '') {
        continue; // it happens on your test url
    }

    // Resolve against the page URL itself (not just scheme://host) so that
    // directory-relative sources such as "img/a.jpg" end up under the page's path.
    $absUrl = url_to_absolute($url, $pic);
    if ($absUrl === false || isset($visited[$absUrl])) {
        continue; // unparsable, or already seen
    }
    $visited[$absUrl] = true;

    // Remove ads images.
    $ignoring = false;
    foreach ($ignore as $item) {
        if (stripos($absUrl, $item) !== false) {
            $ignoring = true;
            break;
        }
    }
    if ($ignoring) {
        continue;
    }

    // getimagesize() downloads the image and returns false when it cannot be read.
    // Such images must be skipped: otherwise their "area" of 0 would beat the
    // initial -1 and a broken URL could be reported as the biggest image.
    $image = @getimagesize($absUrl);
    if ($image === false) {
        continue;
    }

    $area = $image[0] * $image[1];
    if ($area > $maxSize) {
        $maxSize = $area;
        $biggestImage = $absUrl;
    }
}

// Echo the biggest one, once, after all images were compared. The value is
// escaped because it is written into an HTML attribute.
echo '<img src="' . htmlspecialchars($biggestImage, ENT_QUOTES, 'UTF-8') . '" />';
?>
根据您的代码,我创建了以下解决方案——它使用相同的逻辑,并且允许您设置图像的最小宽度和高度,以确保返回正确的图像:
/**
 * Returns the absolute URL of the largest image (by width * height) on a page.
 *
 * Only images that are strictly wider AND strictly taller than the minimum
 * dimensions are considered; images whose URL contains one of the ignore
 * keywords (probable ads) are skipped.
 *
 * @param string $pageUrl Absolute URL of the page to inspect; also the base
 *                        against which relative image sources are resolved.
 * @return string Absolute URL of the chosen image, or '' when none qualifies.
 */
private function getMainImageFromUrl($pageUrl) {
    $minImgWidth = 300;
    $minImgHeight = 300;
    $ignore = array('ad', 'gif'); // get rid of ads (case-insensitive substrings of the URL)

    $biggestImage = '';
    $maxSize = -1;
    $visited = array(); // absolute URL => true

    foreach ($this->getImagesFromDom($pageUrl) as $image) {
        $pic = trim($image->getAttribute('src'));
        # if source is empty, skip to another image
        if ($pic === '') {
            continue;
        }

        # get image absolute url (the page URL is the base)
        $absUrl = url_to_absolute($pageUrl, $pic);
        # skip unparsable URLs and images that were already seen
        if ($absUrl === false || isset($visited[$absUrl])) {
            continue;
        }
        $visited[$absUrl] = true;

        # remove ads
        $ignoring = false;
        foreach ($ignore as $item) {
            if (stripos($absUrl, $item) !== false) {
                $ignoring = true;
                break;
            }
        }
        if ($ignoring) {
            continue;
        }

        # getimagesize() returns false when the image cannot be fetched or decoded
        $imageSize = @getimagesize($absUrl);
        if ($imageSize === false) {
            continue;
        }

        # Apply the minimum dimensions BEFORE comparing areas, so that a large
        # but disqualified image (e.g. a wide, flat banner) cannot raise the
        # threshold and hide a smaller image that does qualify.
        if ($imageSize[0] <= $minImgWidth || $imageSize[1] <= $minImgHeight) {
            continue;
        }

        $area = $imageSize[0] * $imageSize[1];
        if ($area > $maxSize) {
            $maxSize = $area;
            $biggestImage = $absUrl;
        }
    }
    return $biggestImage;
}
/**
 * Loads the HTML document at $url and returns all of its <img> elements.
 *
 * @param string $url Location of the HTML document (anything loadHTMLFile() accepts).
 * @return DOMNodeList The img elements; empty when the document could not be loaded.
 */
private function getImagesFromDom( $url ) {
    // Cap how long a remote read may block. The setting is not restored, so it
    // stays in effect for the rest of the request.
    ini_set('default_socket_timeout', 4);
    $dom = new DOMDocument();
    // Parser options have to be set before the document is loaded; assigning
    // this after loadHTMLFile() has no effect on the already parsed tree.
    $dom->preserveWhiteSpace = false;
    // Real-world HTML is rarely valid; '@' silences the parser warnings and a
    // failed download, in which case the document simply stays empty.
    @$dom->loadHTMLFile( $url );
    # Get images from DOM
    return $dom->getElementsByTagName('img');
}