
I am trying to get sentiment scores for random product descriptions from a CSV file, and I’m running into what I think is an API response-time problem. I’m not sure whether I’m traversing the CSV and calling the API inefficiently, but it takes a long time to get results for all 300+ entries, and whenever I push new changes to my codebase I have to wait for the API to re-evaluate every entry again. Here is the code I wrote for loading the CSV file and getting the sentiment scores:

    <?php

set_time_limit(500); // extended timeout due to slow / overwhelmed API response

function extract_file($csv) { // CSV to array function

    $file = fopen($csv, 'r');

    while (!feof($file)) {
        $lines[] = fgetcsv($file, 1000, ',');
    }

    fclose($file);
    return $lines;

}

$the_file = 'dataset.csv';
$csv_data = extract_file($the_file);



$response_array = []; // array to hold the sentiment values returned for the product descriptions

for($x = 1; $x < count($csv_data) - 1; $x++) { // loop through all descriptions
    echo $x; // show iteration
    $api_text = $csv_data[$x][1];
    $api_text = str_replace('&', ' and ', $api_text); // removing escape sequence characters, '&' breaks the api :)
    $api_text = str_replace(" ", "%20", $api_text); // serializing string
    $text = 'text=';
    $text .=$api_text; // serializing string further for the API
    //echo 'current text1: ', $api_text;
    $curl = curl_init(); // API request init

    curl_setopt_array($curl, [
        CURLOPT_URL => "https://text-sentiment.p.rapidapi.com/analyze",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_ENCODING => "",
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
        CURLOPT_CUSTOMREQUEST => "POST",
        CURLOPT_POSTFIELDS => $text,
        CURLOPT_HTTPHEADER => [
            "X-RapidAPI-Host: text-sentiment.p.rapidapi.com",
            "X-RapidAPI-Key: <snip>",
            "content-type: application/x-www-form-urlencoded"
        ],
    ]);

    $response = curl_exec($curl);
    $err = curl_error($curl);

    curl_close($curl);

    if ($err) {
        echo "cURL Error #:" . $err;
    } else {
        echo $response;
    }


    $json = json_decode($response, true); // decode the JSON response into an associative array
    
    if(isset($json["pos"]) == false) { // catching response error 100, makes array faulty otherwise
        continue;
    }
    else {
        array_push($response_array, array($x, "+" => $json["pos"], "-" => $json["neg"])); // appends array with sentiment values at current index
    }
    
}

echo "<br>";
echo "<br> results: ";

echo "<p>";
for ($y = 0; $y < count($response_array); $y++){ // prints out all the sentiment values
    echo "<br>";
    echo print_r($response_array[$y]);
    echo "<br>";
}
echo "</p>";

echo "<br>the most negative description: ";
$max_neg = array_keys($response_array, max(array_column($response_array, '-')));
//$max_neg = max(array_column($response_array, '-'));
echo print_r($csv_data[$max_neg[0]]);

echo "<br>the most positive description: ";
$max_pos = array_keys($response_array, max(array_column($response_array, '+')));
echo print_r($csv_data[$max_pos[0]]);


?>

What this code snippet aims to do is find the most negative and most positive descriptions in the description column of the CSV and print them out along with their index. I’m only interested in the descriptions with the highest number of positive and negative sentiment words, not the overall sentiment percentage.

The file can be found in this git repo

Thanks for any suggestions

2 Answers


  1. This can be achieved by creating a cache file.

    This solution creates a file cache.json that contains the results from the API, using the product name as the key for each entry.

    On subsequent calls, it will use the cache value if it exists.

    set_time_limit(500);
    
    function file_put_json($file, $data)
    {
        $json = json_encode($data, JSON_PRETTY_PRINT);
        file_put_contents($file, $json);
    }
    
    function file_get_json($file, $as_array=false)
    {
        return json_decode(file_get_contents($file), $as_array);
    }
    
    function file_get_csv($file, $header_row=true)
    {
        $handle = fopen($file, 'r');
        
        if ($header_row === true)
            $header = fgetcsv($handle);
    
        $array = [];
        while ($row = fgetcsv($handle)) {
            if ($header_row === true) {
                $array[] = array_combine($header, array_map('trim', $row));
            } else {
                $array[] = array_map('trim', $row);
            }
        }
        fclose($handle);
        return $array;
    }
    
    function call_sentiment_api($input)
    {
        $text = 'text=' . $input;
        $curl = curl_init();
    
        curl_setopt_array($curl, [
            CURLOPT_URL => "https://text-sentiment.p.rapidapi.com/analyze",
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_ENCODING => "",
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => "POST",
            CURLOPT_POSTFIELDS => $text,
            CURLOPT_HTTPHEADER => [
                "X-RapidAPI-Host: text-sentiment.p.rapidapi.com",
                "X-RapidAPI-Key: <snip>",
                "content-type: application/x-www-form-urlencoded"
            ],
        ]);
    
        $response = curl_exec($curl);
        $err = curl_error($curl);
    
        curl_close($curl);
    
        if ($err) {
            throw new Exception("cURL Error #:" . $err);
        }
    
        return $response;
    }
    
    $csv_data = file_get_csv('dataset.csv');
    
    if (file_exists('cache.json')) {
        $cache_data = file_get_json('cache.json', true);
    } else {
        $cache_data = [];
    }
    
    $cache_names = array_keys($cache_data);
    
    $output = [];
    
    foreach ($csv_data as $csv) {
        $product_name = $csv['name'];
        echo $product_name . '...';
    
        if (in_array($product_name, $cache_names)) {
            echo 'CACHED...' . PHP_EOL;
    
            continue;
        }
    
        $description = urlencode(str_replace('&', ' and ', $csv['description']));
    
        $response = call_sentiment_api($description);
        
        echo 'API...' . PHP_EOL;
    
        $json = json_decode($response, true);
    
        $cache_data[$product_name] = $json;
    }
    
    file_put_json('cache.json', $cache_data);
    
    echo 'SAVE CACHE!' . PHP_EOL . PHP_EOL;
    
    $highest_pos = 0;
    $highest_neg = 0;
    
    $pos = [];
    $neg = [];
    
    foreach ($cache_data as $name => $cache) {
        if (!isset($cache['pos']) || !isset($cache['neg'])) {
            continue;
        }
        if ($cache['pos'] > $highest_pos) {
            $pos = [$name => $cache];
            $highest_pos = $cache['pos'];
        }
        if ($cache['pos'] === $highest_pos) {
            $pos[$name] = $cache;
        }
        if ($cache['neg'] > $highest_neg) {
            $neg = [$name => $cache];
            $highest_neg = $cache['neg'];
        }
        if ($cache['neg'] === $highest_neg) {
            $neg[$name] = $cache;
        }
    }
    
    echo "Most Positive Sentiment: " . $highest_pos . PHP_EOL;
    foreach ($pos as $name => $pos_) {
        echo "t" . $name . PHP_EOL;
    }
    echo PHP_EOL;
    
    echo "Most Negative Sentiment: " . $highest_neg . PHP_EOL;
    foreach ($neg as $name => $neg_) {
        echo "t" . $name . PHP_EOL;
    }
    

    Results in:

    Most Positive Sentiment: 4
            X-Grip Lifting Straps - GymBeam
            Beta Carotene - GymBeam
            Chelated Magnesium - GymBeam
            Creatine Crea7in - GymBeam
            L-carnitine 1000 mg - GymBeam - 20 tabs
            Resistance Band Set - GymBeam
    
    Most Negative Sentiment: 2
            Calorie free Ketchup sauce 320 ml - GymBeam
            ReHydrate Hypotonic Drink 1000 ml - GymBeam
            Vitamin E 60 caps - GymBeam
            Vitamin B-Complex 120 tab - GymBeam
            Zero Syrup Hazelnut Choco 350 ml - GymBeam
            Bio Psyllium - GymBeam
            Zero calorie Vanilla Syrup - GymBeam
    
  2. You need to know where the time is going.

    Start with identifying where the time goes in the curl request.
    My guess is the API response time.

    If that’s the case, I have a solution. Below is the "multi-tasking" code I use to do simultaneous curl requests.

    curl has the timing you need. It looks like this:

      'total_time' => 0.029867,
      'namelookup_time' => 0.000864,
      'connect_time' => 0.001659,
      'pretransfer_time' => 0.00988,
      'size_upload' => 0.0,
      'size_download' => 8300.0,
      'speed_download' => 277898.0,
      'speed_upload' => 0.0,
    

    Just add a couple of lines of code:

    $response = curl_exec($curl);
    $info = var_export(curl_getinfo($curl),true);
    file_put_contents('timing.txt',$info,FILE_APPEND);
    
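
    If you want one number that confirms where the time goes, you can also sum the per-request timings inside your existing loop instead of only dumping them to timing.txt. A minimal sketch (the variable names here are illustrative, not from the original code):

    $total_api_time = 0;
    $request_count  = 0;

    // inside the CSV loop, right after curl_exec($curl):
    $info = curl_getinfo($curl);
    $total_api_time += $info['total_time']; // seconds spent on this request
    $request_count++;

    // after the loop:
    echo "requests: $request_count, total API time: " . round($total_api_time, 2)
        . "s, average: " . round($total_api_time / max(1, $request_count), 3) . "s";
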

    Running simultaneous curl sockets.

    Put your curl in curl.php

      $text = $_GET['text'];
      $curl = curl_init();
      curl_setopt_array($curl, [
            CURLOPT_URL => "https://text-sentiment.p.rapidapi.com/analyze",
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_ENCODING => "",
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => "POST",
            CURLOPT_POSTFIELDS => 'text=' . urlencode($text), // $_GET already decoded the value, so re-encode it for the form body
            CURLOPT_HTTPHEADER => [
                "X-RapidAPI-Host: text-sentiment.p.rapidapi.com",
                "X-RapidAPI-Key: <snip>",
                "content-type: application/x-www-form-urlencoded"
            ],
        ]);
      echo curl_exec($curl); // send the API response back to the socket client
      curl_close($curl);
    

    This code goes in your CSV loop to create all the URL query fields to pass to curl.php (e.g. http://127.0.0.1/curl.php?text=$text)

    $query = urlencode($text);
    $urls[] = array('host' => "127.0.0.1", 'path' => "/curl.php?text=$query");
    

    Then process all the URLs.

      $sockets = [];
      $err = '';
      foreach($urls as $id => $path){
        $host = $path['host'];
        $path = $path['path'];
        $http = "GET $path HTTP/1.0\r\nHost: $host\r\n\r\n";
        $stream = stream_socket_client("$host:80", $errno, $errstr, 120, STREAM_CLIENT_ASYNC_CONNECT|STREAM_CLIENT_CONNECT);
        if ($stream) {
          $sockets[$id] = $stream;  // keyed by request id, supports multiple sockets
          fwrite($stream, $http);
        }
        else {
          $err .= "$id Failed<br>\n";
        }
      }
    

    Then monitor the sockets and retrieve the response from each one, closing each socket as it finishes until you have them all.

    $timeout = 120;       // seconds stream_select() waits for activity
    $buffer_size = 8192;  // bytes read per fread() call
    $results = [];
    $closed = [];
    $write = NULL;
    $except = NULL;
    while (count($sockets)) {
      $read = $sockets;
      stream_select($read, $write, $except, $timeout);
      if (count($read)) {
        foreach ($read as $r) {
          $id = array_search($r, $sockets);
          $data = fread($r, $buffer_size);
          if (strlen($data) == 0) {
         //   echo "$id Closed: " . date('h:i:s') . "\n\n\n";
            $closed[$id] = microtime(true);
            fclose($r);
            unset($sockets[$id]);
          }
          else {
            $results[$id] = ($results[$id] ?? '') . $data;
          }
        }
      }
      else {
     //   echo 'Timeout: ' . date('h:i:s') . "\n\n\n";
        break;
      }
    }
    

    Then all your results are in $results[].
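
    If you would rather keep everything in one script instead of posting back to a local curl.php, PHP's curl_multi_* interface can run the same requests in parallel. A minimal sketch, assuming $post_bodies is an array of already-built "text=..." POST strings (that array and its name are assumptions, not part of the code above):

    $mh = curl_multi_init();
    $handles = [];

    foreach ($post_bodies as $id => $body) {
        $ch = curl_init("https://text-sentiment.p.rapidapi.com/analyze");
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => [
                "X-RapidAPI-Host: text-sentiment.p.rapidapi.com",
                "X-RapidAPI-Key: <snip>",
                "content-type: application/x-www-form-urlencoded"
            ],
        ]);
        curl_multi_add_handle($mh, $ch);
        $handles[$id] = $ch;
    }

    // drive all transfers until none are still running
    do {
        $status = curl_multi_exec($mh, $running);
        if ($running) {
            curl_multi_select($mh); // wait for socket activity instead of busy-looping
        }
    } while ($running && $status === CURLM_OK);

    $results = [];
    foreach ($handles as $id => $ch) {
        $results[$id] = curl_multi_getcontent($ch); // raw JSON response per request
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }
    curl_multi_close($mh);
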

