获得最大图像

问题描述

| 我正在做一个图像搜索项目,并且想要获得一页的最大图像。 我添加了一些代码来修复图片的真实地址,删除可能是广告的图片。比较那里的宽*高回声最大。但是我的代码有问题。这是我的整个代码。任何人都可以帮助我修复错误的地方以及如何优化代码,我觉得这个过程很缓慢。谢谢大家。
<?PHP
require_once \'simple_html_dom.PHP\';
require \'url_to_absolute.PHP\'; //get image absolute url
$v = \'http://www.yomiuri.co.jp/stream/\';
$html = file_get_html($v);
$maxsize = -1; 
$the_biggest_image = false;
$arr = array(\'ad\',\'ads\',\'gif\');// add ads possible words as a arry which is check in the image url
foreach($html->find(\'img\') as $element) {
    preg_match_all(\'#https?://(.*?)($|/)#m\',urldecode(stripcslashes($v)),$r); //get site base url
    $pic = $element->src;
    $comm = url_to_absolute( $r[0][0],$pic);//get image absolute url
    $check_flag = true;
    foreach($arr as $item) {
        if (substr_count(strtolower($comm),$item) > 0) $check_flag = false;
    }// remove ads images
    if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
    reset($comm);
        if (($arr[0] * $arr[1]) > $maxsize) {   
            $maxsize = $arr[0] * $arr[1];  //compare images\' sise
            $the_biggest_image = $comm;
            echo \'<img src=\"\'.$the_biggest_image.\'\" />\'; //echo the biggest one
        }
}
?>
url_to_absolute.PHP
<?PHP
/**
 * Edited by Nitin Kr. Gupta,publicmind.in
 */

/**
 * copyright (c) 2008,David R. Nadeau,NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms,with or without
 * modification,are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice,this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice,this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *  * Neither the names of David R. Nadeau or NadeauSoftware.com,nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE copYRIGHT HOLDERS AND CONTRIBUTORS
 * \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
 * LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND fitness
 * FOR A PARTIculaR PURPOSE ARE disCLaimED. IN NO EVENT SHALL THE
 * copYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,* INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL damAGES (INCLUDING,* BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE,DATA,OR PROFITS; OR BUSInesS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT
 * LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH damAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.PHP
 */

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,* and the relative URL is a URL embedded on that page.
 *
 * This function implements the \"absolutize\" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,* per the URL specification.
 *
 * Parameters:
 *  baseUrl     the absolute base URL.
 *
 *  url     the relative URL to convert.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs,or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute( $baseUrl,$relativeUrl )
{
    // If relative URL has a scheme,clean path and return.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    if ( !empty( $r[\'scheme\'] ) )
    {
        if ( !empty( $r[\'path\'] ) && $r[\'path\'][0] == \'/\' )
            $r[\'path\'] = url_remove_dot_segments( $r[\'path\'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( $b === FALSE || empty( $b[\'scheme\'] ) || empty( $b[\'host\'] ) )
        return FALSE;
    $r[\'scheme\'] = $b[\'scheme\'];

    // If relative URL has an authority,clean path and return.
    if ( isset( $r[\'host\'] ) )
    {
        if ( !empty( $r[\'path\'] ) )
            $r[\'path\'] = url_remove_dot_segments( $r[\'path\'] );
        return join_url( $r );
    }
    unset( $r[\'port\'] );
    unset( $r[\'user\'] );
    unset( $r[\'pass\'] );

    // copy base authority.
    $r[\'host\'] = $b[\'host\'];
    if ( isset( $b[\'port\'] ) ) $r[\'port\'] = $b[\'port\'];
    if ( isset( $b[\'user\'] ) ) $r[\'user\'] = $b[\'user\'];
    if ( isset( $b[\'pass\'] ) ) $r[\'pass\'] = $b[\'pass\'];

    // If relative URL has no path,use base path
    if ( empty( $r[\'path\'] ) )
    {
        if ( !empty( $b[\'path\'] ) )
            $r[\'path\'] = $b[\'path\'];
        if ( !isset( $r[\'query\'] ) && isset( $b[\'query\'] ) )
            $r[\'query\'] = $b[\'query\'];
        return join_url( $r );
    }

    // If relative URL path doesn\'t start with /,merge with base path
    if ( $r[\'path\'][0] != \'/\' )
    {
        $base = mb_strrchr( $b[\'path\'],\'/\',TRUE,\'UTF-8\' );
        if ( $base === FALSE ) $base = \'\';
        $r[\'path\'] = $base . \'/\' . $r[\'path\'];
    }
    $r[\'path\'] = url_remove_dot_segments( $r[\'path\'] );
    return join_url( $r );
}

/**
 * Filter out \".\" and \"..\" segments from a URL\'s path and return
 * the result.
 *
 * This function implements the \"remove_dot_segments\" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,* per the URL specification.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with \".\" and \"..\" removed.
 */
function url_remove_dot_segments( $path )
{
    // multi-byte character explode
    $inSegs  = preg_split( \'!/!u\',$path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg == \'\' || $seg == \'.\')
            continue;
        if ( $seg == \'..\' )
            array_pop( $outSegs );
        else
            array_push( $outSegs,$seg );
    }
    $outPath = implode( \'/\',$outSegs );
    if ( $path[0] == \'/\' )
        $outPath = \'/\' . $outPath;
    // compare last multi-byte character against \'/\'
    if ( $outPath != \'/\' &&
        (mb_strlen($path)-1) == mb_strrpos( $path,\'UTF-8\' ) )
        $outPath .= \'/\';
    return $outPath;
}


/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *  URI-reference   = URI
 *          / relative-ref
 *
 *  URI     = scheme \":\" hier-part [ \"?\" query ] [ \"#\" fragment ]
 *
 *  relative-ref    = relative-part [ \"?\" query ] [ \"#\" fragment ]
 *
 *  hier-part   = \"//\" authority path-abempty
 *          / path-absolute
 *          / path-rootless
 *          / path-empty
 *
 *  relative-part   = \"//\" authority path-abempty
 *          / path-absolute
 *          / path-noscheme
 *          / path-empty
 *
 *  authority   = [ userinfo \"@\" ] host [ \":\" port ]
 *
 * So,a URL has the following major components:
 *
 *  scheme
 *      The name of a method used to interpret the rest of
 *      the URL.  Examples:  \"http\",\"https\",\"mailto\",\"file\'.
 *
 *  authority
 *      The name of the authority governing the URL\'s name
 *      space.  Examples:  \"example.com\",\"user@example.com\",*      \"example.com:80\",\"user:password@example.com:80\".
 *
 *      The authority may include a host name,port number,*      user name,and password.
 *
 *      The host may be a name,an IPv4 numeric address,or
 *      an IPv6 numeric address.
 *
 *  path
 *      The hierarchical path to the URL\'s resource.
 *      Examples:  \"/index.htm\",\"/scripts/page.PHP\".
 *
 *  query
 *      The data for a query.  Examples:  \"?search=google.com\".
 *
 *  fragment
 *      The name of a secondary resource relative to that named
 *      by the path.  Examples:  \"#section1\",\"#header\".
 *
 * An \"absolute\" URL must include a scheme and path.  The authority,query,* and fragment components are optional.
 *
 * A \"relative\" URL does not include a scheme and must include a path.  The
 * authority,and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *  \"scheme\"    The scheme,such as \"http\".
 *  \"host\"      The host name,IPv4,or IPv6 address.
 *  \"port\"      The port number.
 *  \"user\"      The user name.
 *  \"pass\"      The user password.
 *  \"path\"      The path,such as a file path for \"http\".
 *  \"query\"     The query.
 *  \"fragment\"  The fragment.
 *
 * One or more of these may not be present,depending upon the URL.
 *
 * Optionally,the \"user\",\"pass\",\"host\" (if a name,not an IP address),* \"path\",\"query\",and \"fragment\" may have percent-encoded characters
 * decoded.  The \"scheme\" and \"port\" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *  url     the URL to parse.
 *
 *  decode      an optional boolean flag selecting whether
 *          to decode percent encoding or not.  Default = TRUE.
 *
 * Return values:
 *  the associative array of URL parts,or FALSE if the URL is
 *  too malformed to recognize any parts.
 */
function split_url( $url,$decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub     = \'a-zA-Z\\d\\-._~\\!$&\\\'()*+,;=\';
    $xpchar        = $xunressub . \':@% \';

    // Scheme from RFC3986.
    $xscheme        = \'([a-zA-Z][a-zA-Z\\d+-.]*)\';

    // User info (user + password) from RFC3986.
    $xuserinfo     = \'(([\'  . $xunressub . \'%]*)\' .
                     \'(:([\' . $xunressub . \':%]*))?)\';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = \'(\\d{1,3}\\.\\d{1,3})\';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = \'(\\[([a-fA-F\\d.:]+)\\])\';

    // Host name from RFC1035.  Technically,must start with a letter.
    // Relax that restriction to better parse URL structure,then
    // leave host name validation to application.
    $xhost_name    = \'([a-zA-Z\\d-.%]+)\';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = \'(\' . $xhost_name . \'|\' . $xipv4 . \'|\' . $xipv6 . \')\';
    $xport         = \'(\\d*)\';
    $xauthority    = \'((\' . $xuserinfo . \'@)?\' . $xhost .
                 \'?(:\' . $xport . \')?)\';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = \'(/[\' . $xpchar . \']*)\';
    $xpath_authabs = \'((//\' . $xauthority . \')((/[\' . $xpchar . \']*)*))\';
    $xpath_rel     = \'([\' . $xpchar . \']+\' . $xslash_seg . \'*)\';
    $xpath_abs     = \'(/(\' . $xpath_rel . \')?)\';
    $xapath        = \'(\' . $xpath_authabs . \'|\' . $xpath_abs .
             \'|\' . $xpath_rel . \')\';

    // Query and fragment from RFC3986.
    $xqueryfrag    = \'([\' . $xpchar . \'/?\' . \']*)\';

    // URL.
    $xurl          = \'^(\' . $xscheme . \':)?\' .  $xapath . \'?\' .
                     \'(\\?\' . $xqueryfrag . \')?(#\' . $xqueryfrag . \')?$\';


    // Split the URL into components.
    if ( !preg_match( \'!\' . $xurl . \'!\',$url,$m ) )
        return FALSE;

    if ( !empty($m[2]) )        $parts[\'scheme\']  = strtolower($m[2]);

    if ( !empty($m[7]) ) {
        if ( isset( $m[9] ) )   $parts[\'user\']    = $m[9];
        else            $parts[\'user\']    = \'\';
    }
    if ( !empty($m[10]) )       $parts[\'pass\']    = $m[11];

    if ( !empty($m[13]) )       $h=$parts[\'host\'] = $m[13];
    else if ( !empty($m[14]) )  $parts[\'host\']    = $m[14];
    else if ( !empty($m[16]) )  $parts[\'host\']    = $m[16];
    else if ( !empty( $m[5] ) ) $parts[\'host\']    = \'\';
    if ( !empty($m[17]) )       $parts[\'port\']    = $m[18];

    if ( !empty($m[19]) )       $parts[\'path\']    = $m[19];
    else if ( !empty($m[21]) )  $parts[\'path\']    = $m[21];
    else if ( !empty($m[25]) )  $parts[\'path\']    = $m[25];

    if ( !empty($m[27]) )       $parts[\'query\']   = $m[28];
    if ( !empty($m[29]) )       $parts[\'fragment\']= $m[30];

    if ( !$decode )
        return $parts;
    if ( !empty($parts[\'user\']) )
        $parts[\'user\']     = rawurldecode( $parts[\'user\'] );
    if ( !empty($parts[\'pass\']) )
        $parts[\'pass\']     = rawurldecode( $parts[\'pass\'] );
    if ( !empty($parts[\'path\']) )
        $parts[\'path\']     = rawurldecode( $parts[\'path\'] );
    if ( isset($h) )
        $parts[\'host\']     = rawurldecode( $parts[\'host\'] );
    if ( !empty($parts[\'query\']) )
        $parts[\'query\']    = rawurldecode( $parts[\'query\'] );
    if ( !empty($parts[\'fragment\']) )
        $parts[\'fragment\'] = rawurldecode( $parts[\'fragment\'] );
    return $parts;
}


/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification\'s \"component recomposition\"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *  \"scheme\"    The scheme,such as a file path for \"http\".
 *  \"query\"     The query.
 *  \"fragment\"  The fragment.
 *
 * The \"port\",\"user\",and \"pass\" values are only used when a \"host\"
 * is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the \"user\",\"host\" (if a host name,not an
 * IP address),\"path\",and \"fragment\" components.  The \"scheme\"
 * and \"port\" are never encoded.  When a \"scheme\" and \"host\" are both
 * present,the \"path\" is presumed to be hierarchical and encoding
 * processes each segment of the hierarchy separately (i.e.,the slashes
 * are left alone).
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *  parts       an associative array of strings containing the
 *          individual parts of a URL.
 *
 *  encode      an optional boolean flag selecting whether
 *          to do percent encoding or not.  Default = true.
 *
 * Return values:
 *  Returns the assembled URL string.  The string is an absolute
 *  URL if a scheme is supplied,and a relative URL if not.  An
 *  empty string is returned if the $parts array does not contain
 *  any of the needed values.
 */
function join_url( $parts,$encode=FALSE)
{
    if ( $encode )
    {
        if ( isset( $parts[\'user\'] ) )
            $parts[\'user\']     = rawurlencode( $parts[\'user\'] );
        if ( isset( $parts[\'pass\'] ) )
            $parts[\'pass\']     = rawurlencode( $parts[\'pass\'] );
        if ( isset( $parts[\'host\'] ) &&
            !preg_match( \'!^(\\[[\\da-f.:]+\\]])|([\\da-f.:]+)$!ui\',$parts[\'host\'] ) )
            $parts[\'host\']     = rawurlencode( $parts[\'host\'] );
        if ( !empty( $parts[\'path\'] ) )
            $parts[\'path\']     = preg_replace( \'!%2F!ui\',rawurlencode( $parts[\'path\'] ) );
        if ( isset( $parts[\'query\'] ) )
            $parts[\'query\']    = rawurlencode( $parts[\'query\'] );
        if ( isset( $parts[\'fragment\'] ) )
            $parts[\'fragment\'] = rawurlencode( $parts[\'fragment\'] );
    }

    $url = \'\';
    if ( !empty( $parts[\'scheme\'] ) )
        $url .= $parts[\'scheme\'] . \':\';
    if ( isset( $parts[\'host\'] ) )
    {
        $url .= \'//\';
        if ( isset( $parts[\'user\'] ) )
        {
            $url .= $parts[\'user\'];
            if ( isset( $parts[\'pass\'] ) )
                $url .= \':\' . $parts[\'pass\'];
            $url .= \'@\';
        }
        if ( preg_match( \'!^[\\da-f]*:[\\da-f.:]+$!ui\',$parts[\'host\'] ) )
            $url .= \'[\' . $parts[\'host\'] . \']\'; // IPv6
        else
            $url .= $parts[\'host\'];         // IPv4 or name
        if ( isset( $parts[\'port\'] ) )
            $url .= \':\' . $parts[\'port\'];
        if ( !empty( $parts[\'path\'] ) && $parts[\'path\'][0] != \'/\' )
            $url .= \'/\';
    }
    if ( !empty( $parts[\'path\'] ) )
        $url .= $parts[\'path\'];
    if ( isset( $parts[\'query\'] ) )
        $url .= \'?\' . $parts[\'query\'];
    if ( isset( $parts[\'fragment\'] ) )
        $url .= \'#\' . $parts[\'fragment\'];
    return $url;
}

/**
 * This function encodes URL to form a URL which is properly 
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the 
 * disallowed characters in the URL with their repective percent 
 * encodings. Already encoded characters are not encoded again,* such as \'%20\' is not encoded to \'%2520\'.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string. 
 */
function encode_url($url) {
  $reserved = array(
    \":\" => \'!%3A!ui\',\"/\" => \'!%2F!ui\',\"?\" => \'!%3F!ui\',\"#\" => \'!%23!ui\',\"[\" => \'!%5B!ui\',\"]\" => \'!%5D!ui\',\"@\" => \'!%40!ui\',\"!\" => \'!%21!ui\',\"$\" => \'!%24!ui\',\"&\" => \'!%26!ui\',\"\'\" => \'!%27!ui\',\"(\" => \'!%28!ui\',\")\" => \'!%29!ui\',\"*\" => \'!%2A!ui\',\"+\" => \'!%2B!ui\',\",\" => \'!%2C!ui\',\";\" => \'!%3B!ui\',\"=\" => \'!%3D!ui\',\"%\" => \'!%25!ui\',);

  $url = rawurlencode($url);
  $url = preg_replace(array_values($reserved),array_keys($reserved),$url);
  return $url;
}

?>
    

解决方法

您尚未真正说出您遇到了什么错误,但是幸运的是,您的代码中有几个错误。可能会给您带来错误的代码在此块中:
if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {   
    $maxsize = $arr[0] * $arr[1];  //compare images\' sise
    $the_biggest_image = $comm;
    echo \'<img src=\"\'.$the_biggest_image.\'\" />\'; //echo the biggest one
}
您重写“ 3”,它是您的“ ads”过滤变量。 如果ѭ4false为假,您仍需执行以下计算语句
reset()
不适用于弦。 每当您更新最大尺寸时,您都会回显
$the_biggest_image
。那是故意的吗? 更新 尝试使您的代码正常工作,并希望稍微好一些:
<?php
require_once \'simple_html_dom.php\';
require \'url_to_absolute.php\'; //get image absolute url
// options
$url = \'http://www.yomiuri.co.jp/stream/\';
$ignore = array(\'ad\',\'ads\',\'gif\');// add ads possible words as a arry which is check in the image url
$biggestImage = \'path to \"no image found\" image\';
// process
$maxSize = -1;
$visited = array();
$html = file_get_html($url);
// base url
$parts=parse_url($url);
$host=$parts[\'scheme\'].\'://\'.$parts[\'host\'];
// loop images
foreach($html->find(\'img\') as $element) {
    $pic = $element->src;
    if($pic==\'\')continue;// it happens on your test url
    $absUrl = url_to_absolute($host,$pic);//get image absolute url
    // ignore already seen images,add new images
    if(in_array($absUrl,$visited))continue;
    $visited[]=$absUrl;
    // remove ads images
    $ignoring=false;
    foreach($ignore as $item)
        if (stripos($absUrl,$item)!==false){
            $ignoring=true;
            break;
        }
    if($ignoring)continue;
    // get image
    $image=@getimagesize($absUrl);// get the rest images width and height
    if (($image[0] * $image[1]) > $maxSize) {   
        $maxSize = $image[0] * $image[1];  //compare images\' sise
        $biggestImage = $absUrl;
    }
}
echo \'<img src=\"\'.$biggestImage.\'\" />\'; //echo the biggest one
?>
    ,根据您的代码,我创建了以下解决方案-它使用相同的逻辑,并且可以让您设置图像的最小宽度和高度,以确保其返回正确的图像
private function getMainImageFromUrl($pageUrl) {

    $biggestImage = \'\';
    $minImgWidth = 300;
    $minImgHeight = 300;
    $images = $this->getImagesFromDom($pageUrl);
    $visited = array();
    $maxSize = -1;
    $ignore = array(\'ad\',\'gif\'); // get rid of ads (check if these contains following)

    foreach ($images as $image) {
        $pic = $image->getAttribute(\'src\');
        # if source is empty,skip to another image
        if ( empty( $pic ) )
            continue;
        # get image absolute url
        $absUrl = url_to_absolute($pic);
        # ignore already seen images (skip to another),add new images
        if ( in_array( $absUrl,$visited ) )
            continue;
        $visited[] = $absUrl;
        # remove ads
        $ignoring = false;
        foreach($ignore as $item)
            if ( stripos( $absUrl,$item ) !== false ){

                $ignoring=true;
                break;

            }
        if ( $ignoring )
            continue;
        $imageSize = @getimagesize($absUrl);
        if ( ( $imageSize[0] * $imageSize[1] ) > $maxSize) {
            $maxSize = $imageSize[0] * $imageSize[1];
            if ($minImgWidth < $imageSize[0] && $minImgHeight < $imageSize[1])
                $biggestImage = $absUrl;
        }
    }
    return $biggestImage;
}

private function getImagesFromDom( $url ) {
    ini_set(\'default_socket_timeout\',4);
    $dom = new DOMDocument();
    @$dom->loadHTMLFile( $url );
    $dom->preserveWhiteSpace = false;

    # Get images from DOM
    return $dom->getElementsByTagName(\'img\');
}