forked from codeplea/ahocorasickphp
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Issue codeplea#1 - update your utility
* Move benchmarking functionality into its own class.
- Loading branch information
traack_lcruz
committed
Oct 1, 2018
1 parent
3ae023c
commit a93fcb2
Showing
2 changed files
with
153 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,106 +1,11 @@ | ||
<?php | ||
use codeplea\AhoCorasick\Search; | ||
|
||
/* This program will benchmark searching for 1,000 keywords in a 5,000 word text all at once. */ | ||
/* It compares our ahocorasick method with regex and strpos. */ | ||
|
||
use codeplea\AhoCorasick\Benchmark; | ||
|
||
require 'vendor/autoload.php'; | ||
require 'benchmark_setup.php'; /* keywords and text */ | ||
|
||
$loops = 10; | ||
|
||
print('Loaded ' . count($needles) . ' keywords to search on a text of ' . | ||
strlen($haystack) . " characters.\n"); | ||
|
||
print("\nSearching with strpos...\n"); | ||
|
||
$st = microtime(1); | ||
for ($loop = 0; $loop < $loops; ++$loop) { | ||
$found = array(); | ||
foreach ($needles as $n) { | ||
$k = 0; | ||
while (($k = strpos($haystack, $n, $k)) !== false) { | ||
$found[] = array($n, $k); | ||
++$k; | ||
} | ||
} | ||
} | ||
$et = microtime(1); | ||
print('time: ' . ($et - $st) . "\n"); | ||
$found_strpos = $found; | ||
|
||
|
||
print("\nSearching with preg_match...\n"); | ||
//Note, this actually sucks and misses cases where one needle is a prefix or | ||
//suffix of another. | ||
$regex = '/' . implode('|', $needles) . '/'; | ||
|
||
$st = microtime(1); | ||
for ($loop = 0; $loop < $loops; ++$loop) { | ||
$found = array(); | ||
$k = 0; | ||
while (preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) { | ||
$found[] = $m[0]; | ||
$k = $m[0][1] + 1; | ||
} | ||
} | ||
$et = microtime(1); | ||
print('time: ' . ($et - $st) . "\n"); | ||
|
||
|
||
print("\nSearching with preg_match_all...\n"); | ||
//Note, this actually sucks and misses cases where one needle is a prefix or | ||
//suffix of another. | ||
$regex = '/' . implode('|', $needles) . '/'; | ||
|
||
$st = microtime(1); | ||
for ($loop = 0; $loop < $loops; ++$loop) { | ||
$found = array(); | ||
$k = 0; | ||
preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE); | ||
$found = $found[0]; | ||
} | ||
$et = microtime(1); | ||
print('time: ' . ($et - $st) . "\n"); | ||
|
||
|
||
print("\nSearching with aho corasick...\n"); | ||
$ac = new Search(); | ||
foreach ($needles as $n) { | ||
$ac->addNeedle($n); | ||
} | ||
$ac->finalize(); | ||
|
||
$st = microtime(1); | ||
for ($loop = 0; $loop < $loops; ++$loop) { | ||
$found = $ac->execute($haystack); | ||
} | ||
$et = microtime(1); | ||
print('time: ' . ($et - $st) . "\n"); | ||
|
||
|
||
//Check that the answers match. | ||
//First sort the arrays. | ||
$comp = function ($a, $b) { | ||
return ($a[1] === $b[1]) ? ($a[0] > $b[0]) : ($a[1] > $b[1]); | ||
}; | ||
usort($found, $comp); | ||
usort($found_strpos, $comp); | ||
|
||
if ($found_strpos !== $found) { | ||
print("ERROR - Aho Corasick got the wrong result.\n"); | ||
|
||
print('strpos size: ' . count($found_strpos) . "\n"); | ||
print('aho corasick size: ' . count($found) . "\n"); | ||
|
||
$numberFound = count($found); | ||
/* keywords and text */ | ||
require 'benchmark_setup.php'; | ||
|
||
for ($i = 0; $i < $numberFound; ++$i) { | ||
if ($found_strpos[$i] !== $found[$i]) { | ||
print("Mismatch $i\n"); | ||
print_r($found_strpos[$i]); | ||
print_r($found[$i]); | ||
} | ||
} | ||
} | ||
// Benchmark searching for 1,000 keywords in a 5,000 word text all at once. | ||
$benchmark = new Benchmark(); | ||
$benchmark->run($needles, $haystack); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
<?php | ||
namespace codeplea\AhoCorasick; | ||
|
||
use codeplea\AhoCorasick\Search; | ||
|
||
/**
 * Times several multi-keyword search strategies (strpos, preg_match,
 * preg_match_all and the Aho-Corasick Search class) over the same input and
 * prints the elapsed wall-clock time of each to standard output.
 *
 * The strpos results are kept as the reference answer and compared with the
 * Aho-Corasick results at the end of the run.
 */
class Benchmark
{
    /**
     * Matches collected by the last strpos benchmark, as [needle, offset] pairs.
     *
     * @var array
     */
    protected $foundStrpos = [];

    /**
     * Finds every (overlapping) occurrence of every needle with strpos and
     * stores the matches of the final loop in $this->foundStrpos.
     *
     * @param array  $needles
     * @param string $haystack
     * @param int    $loops    Number of times the whole search is repeated.
     */
    protected function benchmarkStrpos(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with strpos...\n";

        // An empty needle "matches" at every offset and eventually pushes the
        // offset past the end of the haystack (ValueError on PHP 8, warning on
        // PHP 7), so it is filtered out before the timed section.
        // NOTE(review): how Search treats an empty needle is not visible here.
        $searchable = array_filter($needles, static function ($needle) {
            return (string) $needle !== '';
        });

        // Defined up front so a $loops of 0 yields an empty result rather
        // than an undefined variable.
        $found = [];

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            $found = [];
            foreach ($searchable as $n) {
                $k = 0;
                while (($k = strpos($haystack, $n, $k)) !== false) {
                    $found[] = [$n, $k];
                    // Advance by one byte only, so overlapping hits are found too.
                    ++$k;
                }
            }
        }
        $this->printElapsed($st);

        $this->foundStrpos = $found;
    }

    /**
     * Repeatedly calls preg_match with an increasing offset to walk through
     * all matches of a single alternation pattern.
     *
     * @param array  $needles
     * @param string $haystack
     * @param int    $loops    Number of times the whole search is repeated.
     */
    protected function benchmarkPregMatch(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with preg_match...\n";

        // Note: an alternation reports one match per position, so it misses
        // cases where one needle is a prefix or suffix of another.
        $regex = $this->buildAlternationRegex($needles);

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            $k = 0;
            while (preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) {
                // Restart one byte after the start of the previous match.
                $k = $m[0][1] + 1;
            }
        }
        $this->printElapsed($st);
    }

    /**
     * Runs a single preg_match_all per loop over an alternation pattern.
     *
     * @param array  $needles
     * @param string $haystack
     * @param int    $loops    Number of times the whole search is repeated.
     */
    protected function benchmarkPregMatchAll(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with preg_match_all...\n";

        // Note: an alternation reports one match per position, so it misses
        // cases where one needle is a prefix or suffix of another.
        $regex = $this->buildAlternationRegex($needles);

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE);
        }
        $this->printElapsed($st);
    }

    /**
     * Builds a Search automaton from the needles (outside the timed section),
     * times its execution, then checks the result against the strpos matches
     * recorded earlier by benchmarkStrpos().
     *
     * @param array  $needles
     * @param string $haystack
     * @param int    $loops    Number of times the search is repeated.
     * @throws \Exception
     */
    protected function benchmarkAhoCorasick(array $needles, string $haystack, int $loops)
    {
        print "\nSearching with aho corasick...\n";

        $ac = new Search();
        foreach ($needles as $n) {
            $ac->addNeedle($n);
        }
        $ac->finalize();

        // Defined up front so a $loops of 0 still leaves a sortable array.
        $found = [];

        $st = microtime(true);
        for ($loop = 0; $loop < $loops; ++$loop) {
            $found = $ac->execute($haystack);
        }
        $this->printElapsed($st);

        $this->verifyAgainstStrpos($found);
    }

    /**
     * Compares the performance of Aho Corasick against strpos, preg_match, and preg_match_all
     *
     * @param array  $needles
     * @param string $haystack
     * @param int    $loops
     * @throws \Exception
     */
    public function run(array $needles, string $haystack, int $loops = 10)
    {
        print 'Loaded ' . count($needles) . ' keywords to search on a text of ' . strlen($haystack) . " characters.\n";

        // benchmarkStrpos() must run before benchmarkAhoCorasick(): the latter
        // validates its matches against the strpos result.
        $this->benchmarkStrpos($needles, $haystack, $loops);
        $this->benchmarkPregMatch($needles, $haystack, $loops);
        $this->benchmarkPregMatchAll($needles, $haystack, $loops);
        $this->benchmarkAhoCorasick($needles, $haystack, $loops);
    }

    /**
     * Joins the needles into one alternation pattern, escaping each needle so
     * regex metacharacters and the "/" delimiter are matched literally.
     */
    private function buildAlternationRegex(array $needles): string
    {
        $quoted = array_map(static function ($needle) {
            return preg_quote((string) $needle, '/');
        }, $needles);

        return '/' . implode('|', $quoted) . '/';
    }

    /**
     * Prints the wall-clock seconds elapsed since $start (a microtime(true) value).
     */
    private function printElapsed(float $start)
    {
        print 'time: ' . (microtime(true) - $start) . "\n";
    }

    /**
     * Sorts both result sets identically and reports every difference between
     * the given matches and the strpos reference matches.
     *
     * @param array $found Matches to verify; presumably [needle, offset] pairs
     *                     like the strpos results (TODO confirm against Search::execute).
     */
    private function verifyAgainstStrpos(array $found)
    {
        // Order by offset, then by needle. The comparator returns an int as
        // usort requires; strcmp gives a total order even for numeric-looking
        // needles, so ties cannot reorder the two arrays differently.
        $comp = static function ($a, $b) {
            return ($a[1] <=> $b[1]) ?: strcmp((string) $a[0], (string) $b[0]);
        };
        usort($found, $comp);
        usort($this->foundStrpos, $comp);

        if ($this->foundStrpos === $found) {
            return;
        }

        print "ERROR - Aho Corasick got the wrong result.\n";

        print 'strpos size: ' . count($this->foundStrpos) . "\n";
        print 'aho corasick size: ' . count($found) . "\n";

        // Walk the longer of the two lists so surplus entries on either side
        // are reported instead of triggering undefined-offset notices.
        $limit = max(count($found), count($this->foundStrpos));

        for ($i = 0; $i < $limit; ++$i) {
            $expected = $this->foundStrpos[$i] ?? null;
            $actual = $found[$i] ?? null;

            if ($expected !== $actual) {
                print "Mismatch $i\n";
                print_r($expected);
                print_r($actual);
            }
        }
    }
}