2014年9月10日 星期三

[C++] 使用 PCRE、RE2 進行 Match all (如同 PHP preg_match_all 效果) @ Ubuntu 14.04

對 PHP 來說:

$ vim t.php
<?php
// Sample HTML to scan; the trailing comment shows where real input could be loaded instead.
$data = "..<a href='...'>...</a>.."; //file_get_contents(...);
// Collect every anchor tag in one pass. On success $matches[0] holds the full
// tags, $matches[1] the href values and $matches[2] the link bodies.
$matched = preg_match_all("#<a[^h]*href=['\"]{0,1}([^\"']+)[\"']{0,1}[^>]*>(.*?)</a>#", $data, $matches);
if ($matched)
        print_r($matches);
$ php t.php
Array
(
    [0] => Array
        (
            [0] => <a href='...'>...</a>
        )

    [1] => Array
        (
            [0] => ...
        )

    [2] => Array
        (
            [0] => ...
        )

)


對 PCRE 來說:

$ vim pcre_test.cpp
#include <pcre.h>
#include <iostream>

int main() {

const char *error;
int erroroffset;
pcre *preg_pattern_a_tag = pcre_compile("<a[^h]*href=['\"]{0,1}([^\"']+)[\"']{0,1}[^>]*>(.*?)</a>", PCRE_MULTILINE, &error,  &erroroffset, NULL);

if (!preg_pattern_a_tag) {
std::cout << "ERROR\n";
return -1;
}

std::string raw = "..<a href='...'>...</a>..";

unsigned int offset = 0;
unsigned int len = raw.size();
int matchInfo[3*2] = {0};
int rc = 0;

while (offset < len && (rc = pcre_exec(preg_pattern_a_tag, 0, raw.c_str(), len, offset, 0,  matchInfo, sizeof(matchInfo))) >= 0) {
for (int n=0; n<rc ; ++n) {
int data_length = matchInfo[2*n+1] - matchInfo[2*n];
std::cout << "Found:[" << raw.substr(matchInfo[2*n], data_length) << "]\n";
}
offset = matchInfo[1];
}
return 0;
}
$ g++ -std=c++11 pcre_test.cpp -lpcre
$ ./a.out
Found:[<a href='...'>...</a>]
Found:[...]
Found:[...]


對 RE2 來說:

$ vim re2_test.cpp
#include <re2/re2.h>
#include <iostream>

int main() {

//RE2 preg_pattern_a_tag("<a[^h]*href=['\"]{0,1}([^\"']+)[\"']{0,1}[^>]*>(.*?)</a>", RE2::Latin1);
RE2 preg_pattern_a_tag("<a[^h]*href=['\"]{0,1}([^\"']+)[\"']{0,1}[^>]*>(.*?)</a>");

std::string raw = "..<a href='...'>...</a>..";

re2::StringPiece result_a_href, result_a_body;

while(RE2::PartialMatch(raw, preg_pattern_a_tag, &result_a_href, &result_a_body)) {
std::cout << "result_a_href:[" << result_a_href << "]\n";
std::cout << "result_a_body:[" << result_a_body << "]\n";
raw = result_a_body.data();
}
return 0;
}
$ g++ -std=c++11 re2_test.cpp /path/libre2.a -lpthread
$ ./a.out
result_a_href:[...]
result_a_body:[...]

沒有留言:

張貼留言