Skip to content

Commit

Permalink
Issue codeplea#1 - update your utility
Browse files Browse the repository at this point in the history
* Move benchmarking functionality into its own class.
  • Loading branch information
traack_lcruz committed Oct 1, 2018
1 parent 3ae023c commit a93fcb2
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 101 deletions.
107 changes: 6 additions & 101 deletions benchmark.php
Original file line number Diff line number Diff line change
@@ -1,106 +1,11 @@
<?php
use codeplea\AhoCorasick\Search;

/* This program will benchmark searching for 1,000 keywords in a 5,000 word text all at once. */
/* It compares our ahocorasick method with regex and strpos. */
// NOTE(review): this listing contains both an inline benchmark script and, at the
// very end, a call into the Benchmark class that runs the same comparison. It looks
// like the before and after versions of a refactor are interleaved - confirm which
// lines are meant to be live.

use codeplea\AhoCorasick\Benchmark;

require 'vendor/autoload.php';
// Presumably defines $needles and $haystack, which are used below - verify.
require 'benchmark_setup.php'; /* keywords and text */

// Number of times each search strategy is repeated inside its timed section.
$loops = 10;

print('Loaded ' . count($needles) . ' keywords to search on a text of ' .
strlen($haystack) . " characters.\n");

print("\nSearching with strpos...\n");

// Strategy 1: one strpos scan per needle.
// $found is reset on every repetition, so only the last repetition's matches survive.
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
foreach ($needles as $n) {
$k = 0;
while (($k = strpos($haystack, $n, $k)) !== false) {
$found[] = array($n, $k);
// Resume one byte after the hit so overlapping occurrences are found too.
++$k;
}
}
}
$et = microtime(1);
print('time: ' . ($et - $st) . "\n");
// Kept as the reference result that the Aho-Corasick output is checked against below.
$found_strpos = $found;


print("\nSearching with preg_match...\n");
//Note, this actually sucks and misses cases where one needle is a prefix or
//suffix of another.
// NOTE(review): needles are joined without preg_quote(), so this assumes they contain
// no regex metacharacters and no '/' delimiter - confirm against benchmark_setup.php.
$regex = '/' . implode('|', $needles) . '/';

// Strategy 2: repeated preg_match calls, each restarting one byte after the previous hit.
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$k = 0;
while (preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) {
// With PREG_OFFSET_CAPTURE, $m[0] is a [matched text, byte offset] pair.
$found[] = $m[0];
$k = $m[0][1] + 1;
}
}
$et = microtime(1);
print('time: ' . ($et - $st) . "\n");


print("\nSearching with preg_match_all...\n");
//Note, this actually sucks and misses cases where one needle is a prefix or
//suffix of another.
$regex = '/' . implode('|', $needles) . '/';

// Strategy 3: a single preg_match_all call per repetition.
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
// $k is assigned but not read by anything in this loop.
$k = 0;
preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE);
// Index 0 holds the full-pattern matches.
$found = $found[0];
}
$et = microtime(1);
print('time: ' . ($et - $st) . "\n");


print("\nSearching with aho corasick...\n");
// Strategy 4: the Search object is populated and finalized before the timer starts,
// so only execute() is inside the timed section.
$ac = new Search();
foreach ($needles as $n) {
$ac->addNeedle($n);
}
$ac->finalize();

$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = $ac->execute($haystack);
}
$et = microtime(1);
print('time: ' . ($et - $st) . "\n");


//Check that the answers match.
//First sort the arrays.
// Orders entries by element [1] first, then by element [0]; for the strpos results
// those are the offset and the needle respectively.
// NOTE(review): the callback returns a bool rather than a negative/zero/positive int;
// PHP 8 deprecates bool-returning usort comparators - confirm the target PHP version.
$comp = function ($a, $b) {
return ($a[1] === $b[1]) ? ($a[0] > $b[0]) : ($a[1] > $b[1]);
};
usort($found, $comp);
usort($found_strpos, $comp);

// $found holds the Aho-Corasick result at this point (the last assignment above).
if ($found_strpos !== $found) {
print("ERROR - Aho Corasick got the wrong result.\n");

print('strpos size: ' . count($found_strpos) . "\n");
print('aho corasick size: ' . count($found) . "\n");

$numberFound = count($found);
// NOTE(review): second require of benchmark_setup.php, sitting inside the error
// branch - presumably it belongs with the Benchmark call at the bottom; confirm.
/* keywords and text */
require 'benchmark_setup.php';

// NOTE(review): the bound comes from count($found) but $found_strpos is indexed with
// it as well, so a longer $found reads past the end of $found_strpos.
for ($i = 0; $i < $numberFound; ++$i) {
if ($found_strpos[$i] !== $found[$i]) {
print("Mismatch $i\n");
print_r($found_strpos[$i]);
print_r($found[$i]);
}
}
}
// Benchmark searching for 1,000 keywords in a 5,000 word text all at once.
$benchmark = new Benchmark();
$benchmark->run($needles, $haystack);
147 changes: 147 additions & 0 deletions src/Benchmark.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
<?php
namespace codeplea\AhoCorasick;

use codeplea\AhoCorasick\Search;

class Benchmark
{
    /**
     * Matches recorded by the most recent strpos pass, as [needle, offset] pairs.
     *
     * Used as the reference result when checking the Aho-Corasick output.
     *
     * @var array
     */
    protected $foundStrpos = [];

    /**
     * Times a naive search that scans the haystack once per needle with strpos.
     *
     * The matches of the final repetition are stored in $this->foundStrpos.
     *
     * @param array $needles keywords to look for
     * @param string $haystack text to search in
     * @param int $loops how many times the whole search is repeated
     */
    protected function benchmarkStrpos(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with strpos...\n";

        // Defined up front so that a run with $loops < 1 stores an empty list
        // instead of reading an undefined variable.
        $found = [];

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            $found = [];
            foreach ($needles as $n) {
                $k = 0;
                while (($k = strpos($haystack, $n, $k)) !== false) {
                    $found[] = [$n, $k];
                    // Resume one byte after the hit so overlapping occurrences count too.
                    ++$k;
                }
            }
        }

        $et = microtime(true);
        print 'time: ' . ($et - $st) . "\n";
        $this->foundStrpos = $found;
    }

    /**
     * Times a search that calls preg_match repeatedly, restarting one byte after each hit.
     *
     * @param array $needles keywords to look for
     * @param string $haystack text to search in
     * @param int $loops how many times the whole search is repeated
     */
    protected function benchmarkPregMatch(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with preg_match...\n";

        // Note, this actually sucks and misses cases where one needle is a prefix or
        // suffix of another.
        $regex = $this->buildRegex($needles);

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            $k = 0;
            while (preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) {
                // $m[0] is a [matched text, byte offset] pair.
                $k = $m[0][1] + 1;
            }
        }
        $et = microtime(true);
        print 'time: ' . ($et - $st) . "\n";
    }

    /**
     * Times a search that hands the whole job to a single preg_match_all call.
     *
     * @param array $needles keywords to look for
     * @param string $haystack text to search in
     * @param int $loops how many times the whole search is repeated
     */
    protected function benchmarkPregMatchAll(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with preg_match_all...\n";

        // Note, this actually sucks and misses cases where one needle is a prefix or
        // suffix of another.
        $regex = $this->buildRegex($needles);

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE);
        }
        $et = microtime(true);
        print 'time: ' . ($et - $st) . "\n";
    }

    /**
     * Times the Aho-Corasick search and checks its matches against the strpos pass.
     *
     * The Search object is populated and finalized before the timer starts, so only
     * execute() is measured. The check uses $this->foundStrpos, so benchmarkStrpos()
     * has to run first for the comparison to be meaningful.
     *
     * @param array $needles keywords to look for
     * @param string $haystack text to search in
     * @param int $loops how many times the search is repeated
     * @throws \Exception
     */
    protected function benchmarkAhoCorasick(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with aho corasick...\n";

        $ac = new Search();
        foreach ($needles as $n) {
            $ac->addNeedle($n);
        }
        $ac->finalize();

        // Defined up front so the verification below still has a value when $loops < 1.
        $found = [];

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            $found = $ac->execute($haystack);
        }
        $et = microtime(true);
        print 'time: ' . ($et - $st) . "\n";

        $this->verifyAgainstStrpos($found);
    }

    /**
     * Builds one alternation pattern that matches any of the needles literally.
     *
     * Every needle is passed through preg_quote so that regex metacharacters and the
     * '/' delimiter inside a keyword cannot corrupt or change the pattern.
     *
     * @param array $needles keywords to look for
     * @return string a '/'-delimited PCRE pattern
     */
    protected function buildRegex(array $needles): string
    {
        $quoted = array_map(function ($needle) {
            return preg_quote((string) $needle, '/');
        }, $needles);

        return '/' . implode('|', $quoted) . '/';
    }

    /**
     * Orders two [needle, offset] pairs by offset first, then by needle.
     *
     * Needles are compared byte-wise with strcmp so the order is total; a loose
     * comparison treats numeric strings such as "1" and "1.0" as equal, which would
     * leave their relative order dependent on the input order.
     *
     * @param array $a first [needle, offset] pair
     * @param array $b second [needle, offset] pair
     * @return int negative, zero or positive, as usort expects
     */
    protected static function compareMatches(array $a, array $b): int
    {
        if ($a[1] !== $b[1]) {
            return $a[1] <=> $b[1];
        }

        return strcmp((string) $a[0], (string) $b[0]);
    }

    /**
     * Sorts both result lists and prints a report if they are not identical.
     *
     * Sorts $this->foundStrpos in place.
     *
     * @param array $found matches returned by the Aho-Corasick search
     */
    protected function verifyAgainstStrpos(array $found)
    {
        // Check that the answers match.
        // First sort the arrays.
        $comp = function ($a, $b) {
            return static::compareMatches($a, $b);
        };
        usort($found, $comp);
        usort($this->foundStrpos, $comp);

        if ($this->foundStrpos === $found) {
            return;
        }

        print "ERROR - Aho Corasick got the wrong result.\n";

        print 'strpos size: ' . count($this->foundStrpos) . "\n";
        print 'aho corasick size: ' . count($found) . "\n";

        // Walk the longer of the two lists so surplus entries on either side are
        // reported and neither list is read past its end.
        $limit = max(count($this->foundStrpos), count($found));

        for ($i = 0; $i < $limit; ++$i) {
            $expected = $this->foundStrpos[$i] ?? null;
            $actual = $found[$i] ?? null;

            if ($expected !== $actual) {
                print "Mismatch $i\n";
                print_r($expected);
                print_r($actual);
            }
        }
    }

    /**
     * Compares the performance of Aho Corasick against strpos, preg_match, and preg_match_all
     *
     * @param array $needles keywords to look for
     * @param string $haystack text to search in
     * @param int $loops how many times each strategy is repeated
     * @throws \Exception
     */
    public function run(array $needles, string $haystack, int $loops = 10)
    {
        print 'Loaded ' . count($needles) . ' keywords to search on a text of ' . strlen($haystack) . " characters.\n";

        // The strpos pass goes first: it records the reference matches that the
        // Aho-Corasick pass is checked against.
        $this->benchmarkStrpos($needles, $haystack, $loops);
        $this->benchmarkPregMatch($needles, $haystack, $loops);
        $this->benchmarkPregMatchAll($needles, $haystack, $loops);
        $this->benchmarkAhoCorasick($needles, $haystack, $loops);
    }
}

0 comments on commit a93fcb2

Please sign in to comment.