skip to Main Content

I’ve got a double foreach loop. The script takes URLs from one file and tries to find them in the HTML code of the pages listed in another file. Of course, reading so many pages is pretty hard on the server, so I want to optimize the script, but how can I do it?

Here is the code:

<?php
// Each list file holds one entry per line. A failed download (file() returns
// false) is treated as an empty list instead of being handed to array_map(),
// and blank lines are dropped: an empty string is not a fetchable site, and
// on PHP 8 strpos() reports a match for an empty needle on every page.
$sites_raw = file('https://earnmoneysafe.com/script/sites.txt');
$sites = array_values(array_filter(array_map('trim', $sites_raw ?: []), 'strlen'));
$urls_raw = file('https://earnmoneysafe.com/script/4toiskatj.txt');
$urls = array_values(array_filter(array_map('trim', $urls_raw ?: []), 'strlen'));

/**
 * Download a page with cURL and return its body.
 *
 * The request carries a desktop Firefox User-Agent, follows redirects (with
 * the Referer header set automatically) and leaves response headers out of
 * the returned data.
 *
 * @param string $url Address of the page to download.
 * @return string|false The response body, or false when the transfer fails.
 */
function file_get_contents_curl($url) {
    $ch = curl_init();
    $userAgent = 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0';

    // Every option has to be set on the handle created above; setting the
    // User-Agent on any other variable leaves the request without it.
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);

    $data = curl_exec($ch);
    curl_close($ch);

    return $data;
}

// Download every listed site once, then print 'true' for each wanted URL
// that occurs somewhere in that site's HTML.
foreach ($sites as $pageAddress) {
    $html = file_get_contents_curl($pageAddress);

    foreach ($urls as $wanted) {
        // strpos() yields false only when $wanted is absent; offset 0 is a hit.
        if (strpos($html, $wanted) === false) {
            continue;
        }

        echo 'true';
    }
}
?>

2

Answers


  1. Use curl_multi_exec() to fetch all the URLs in parallel.

    // Both lists hold one entry per line. Blank lines are skipped, and a failed
    // download (file() returns false) is treated as an empty list.
    $urls = file('https://earnmoneysafe.com/script/4toiskatj.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) ?: [];
    $sites = file('https://earnmoneysafe.com/script/sites.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) ?: [];

    // One configured handle per site, keyed by the site address.
    $curl_handles = [];
    foreach ($sites as $site) {
        $curl_handles[$site] = get_curl($site);
    }
    $mh = curl_multi_init();
    foreach ($curl_handles as $ch) {
        curl_multi_add_handle($mh, $ch);
    }

    // A single curl_multi_exec() call only advances the transfers a little, so
    // keep calling it until no handle is active any more. curl_multi_select()
    // waits for socket activity between calls instead of spinning the CPU.
    do {
        $mrc = curl_multi_exec($mh, $active);
        if ($mrc == CURLM_OK && $active) {
            curl_multi_select($mh);
        }
    } while ($mrc == CURLM_CALL_MULTI_PERFORM || ($mrc == CURLM_OK && $active));

    // All transfers are finished: search each downloaded page for every URL.
    foreach ($curl_handles as $site => $ch) {
        $homepage = curl_multi_getcontent($ch);
        foreach ($urls as $needle) {
            if (strpos($homepage, $needle) !== false) {
                echo 'true';
            }
        }
        curl_multi_remove_handle($mh, $ch);
    }

    curl_multi_close($mh);
        
    /**
     * Create a cURL handle configured to fetch $url; no transfer is started here.
     *
     * The handle sends a desktop Firefox User-Agent, follows redirects with an
     * automatic Referer, omits response headers and returns the body to the
     * caller rather than printing it.
     *
     * @param string $url Address the handle should request.
     * @return mixed The handle produced by curl_init().
     */
    function get_curl($url) {
        $handle = curl_init();

        // Same options, same order, applied in a single call.
        curl_setopt_array($handle, [
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0',
            CURLOPT_AUTOREFERER    => TRUE,
            CURLOPT_HEADER         => 0,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_URL            => $url,
            CURLOPT_FOLLOWLOCATION => TRUE,
        ]);

        return $handle;
    }
    
    Login or Signup to reply.
  2. I think this code is cleaner:

    <?php
    
    // Locations of the two input lists: the pages to scan, and the URLs to
    // look for inside those pages.
    const SITES_URL = 'https://earnmoneysafe.com/script/sites.txt',
          URLS_URL = 'https://earnmoneysafe.com/script/4toiskatj.txt';
    
    /**
     * Read a text resource and return its non-blank lines.
     *
     * @param string $url Path or URL readable by file_get_contents().
     * @return array List of trimmed, non-empty lines, re-indexed from 0. Empty
     *               when the resource cannot be read.
     */
    function readFileLines($url) {
        $file_contents = file_get_contents($url);
        if ($file_contents === false) {
            return [];
        }

        // Split on real line breaks (\n, \r\n or \r), not on the letter "n".
        $lines = preg_split('/\R/', $file_contents) ?: [];

        $filtered_lines = [];
        foreach ($lines as $line) {
            // Store the trimmed text so stray whitespace or "\r" never ends up
            // inside a URL.
            $line = trim($line);
            // Compare with '' instead of empty(): empty() would also discard a
            // line that consists of just "0".
            if ($line !== '') {
                $filtered_lines[] = $line;
            }
        }

        return $filtered_lines;
    }
    
    /**
     * Print 'true' once for every entry of $urls that occurs in the page at $site.
     *
     * @param string $site Path or URL of the page, read with file_get_contents().
     * @param array  $urls Strings to look for in the page source.
     * @return void
     */
    function checkSiteUrls($site, $urls) {
        $pageSource = file_get_contents($site);

        foreach ($urls as $candidate) {
            $position = strpos($pageSource, $candidate);

            // false means "not found"; any integer offset, including 0, is a hit.
            if ($position === false) {
                continue;
            }

            echo 'true';
        }
    }
    
    // Load both lists up front: first the pages to scan, then the URLs to find.
    $sites = readFileLines(SITES_URL);
    $urls = readFileLines(URLS_URL);

    // Check every page, in list order, against the full set of URLs.
    array_walk($sites, function ($siteAddress) use ($urls) {
        checkSiteUrls($siteAddress, $urls);
    });
    
    ?>
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search