pcre 库在nginx中的应用
概述
PCRE(Perl Compatible Regular Expressions)是一个与Perl正则表达式语法兼容的正则表达式库,nginx使用的就是该库。
系统:Mac OS X pcre版本:version 8.38 2015-11-23 安装路径:
$brew list pcre
/usr/local/Cellar/pcre/8.38/bin/pcre-config
/usr/local/Cellar/pcre/8.38/bin/pcregrep
/usr/local/Cellar/pcre/8.38/bin/pcretest
/usr/local/Cellar/pcre/8.38/include/ (6 files)
/usr/local/Cellar/pcre/8.38/lib/libpcre.1.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcre16.0.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcre32.0.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcrecpp.0.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcreposix.0.dylib
/usr/local/Cellar/pcre/8.38/lib/pkgconfig/ (5 files)
/usr/local/Cellar/pcre/8.38/lib/ (10 other files)
/usr/local/Cellar/pcre/8.38/share/doc/ (64 files)
/usr/local/Cellar/pcre/8.38/share/man/ (103 files)
先看个简单的例子,输出pcre版本号:
$cat pretest.c
#define PCRE_STATIC
#include <stdio.h>
#include <pcre.h>
/* Print the version string reported by the linked PCRE library. */
int main() {
    printf("version %s\n", pcre_version());
    return 0;
}
编译&执行:
gcc pretest.c -I /usr/local/Cellar/pcre/8.38/include/ -L /usr/local/Cellar/pcre/8.38/lib/ -lpcre
$./a.out
version 8.38 2015-11-23
pcre api
- pcre_compile
pcre *pcre_compile(const char *pattern, int options,
const char **errptr, int *erroffset,
const unsigned char *tableptr);
将一个正则表达式编译为一个内部结构,匹配多个字符串时可以加快匹配速度。 参数: pattern: 包含正则表达式的c字符串 options: 0或者其他参数选项 errptr: 返回的错误信息 erroffset: 出错位置在正则表达式中的偏移 tableptr: 指向字符表的指针(由pcre_maketables生成),为NULL时使用内置字符表 具体请看man手册。
- pcre_fullinfo
int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
int what, void *where);
返回编译好的模式信息。 参数: code: 编译好的模式,pcre_compile的返回值。 extra: pcre_study()的返回值,或NULL what: 要返回什么信息 where: 返回的结果 具体请看man手册
- pcre_study
pcre_extra *pcre_study(const pcre *code, int options,
const char **errptr);
对编译好的模式进行学习,提取可以加速匹配的信息 参数: code: 编译好的模式 options: 选项 errptr: 错误信息 具体请看man手册
- pcre_exec
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize);
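使用编译好的模式对字符串进行匹配,采用与Perl相似的算法。返回值大于0表示匹配成功,其值为最大的已设置捕获序号加1(序号0是整个匹配串);返回0表示匹配成功但ovector太小,放不下全部捕获结果;返回负数表示未匹配(PCRE_ERROR_NOMATCH)或出错。各子串的起止偏移通过ovector返回。 参数: code: 编译好的模式 extra: 指向一个pcre_extra结构体(pcre_study的返回值),可以为NULL subject: 需要匹配的字符串 length: 匹配的字符串长度(Byte) startoffset: 匹配的开始位置 options: 选项位 ovector: 保存匹配结果偏移的整型数组 ovecsize: 数组元素个数,应为3的倍数 具体请看man手册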
例子
#define PCRE_STATIC
#include <stdio.h>
#include <string.h>
#include <pcre.h>
/*
 * Compile a pattern, print the size of the compiled form, match it against
 * a fixed subject and print every captured substring.
 *
 * Returns 0 on success, 1 if pcre_compile(), pcre_fullinfo() or pcre_exec()
 * fails (including "no match").
 */
int main() {
    enum { OVECCOUNT = 30 };    /* pcre_exec() expects a multiple of 3 */

    const char *err;
    int erroffset;
    const char *s = "<title>Hello World</title>";
    const char *p = "<title>(.*)</title>";
    size_t infosize;            /* PCRE_INFO_SIZE stores a size_t, not an int */
    int ovector[OVECCOUNT] = {0};

    pcre *re = pcre_compile(p, 0, &err, &erroffset, NULL);
    if (re == NULL) {
        printf("compile err: %s %d\n", err, erroffset);
        return 1;
    }

    int n = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &infosize);
    if (n < 0) {
        printf("fullinfo err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo res: %zu\n", infosize);

    int rc = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, ovector, OVECCOUNT);
    if (rc < 0) {
        pcre_free(re);
        printf("pcre_exec %d\n", rc);
        return 1;
    }
    if (rc == 0) {
        /* matched, but ovector was too small: only OVECCOUNT/3 pairs are valid */
        rc = OVECCOUNT / 3;
    }

    for (int i = 0; i < rc; i++) {
        if (ovector[2 * i] < 0) {
            /* group did not take part in the match; offsets are -1 */
            continue;
        }
        const char *substring_start = s + ovector[2 * i];
        int substring_length = ovector[2 * i + 1] - ovector[2 * i];
        printf("$%2d: %.*s\n", i, substring_length, substring_start);
    }

    /* release the compiled pattern on the success path as well */
    pcre_free(re);
    return 0;
}
运行:
$./a.out
fullinfo res: 111
$ 0: <title>Hello World</title>
$ 1: Hello World
pcre2例子:
#define PCRE2_CODE_UNIT_WIDTH 8
#include <stdio.h>
#include <string.h>
#include <pcre2.h>
int main() {
int errcode;
size_t erroff;
unsigned char ss[] = "<title>Hello World</title>";
const unsigned char *s = ss;
unsigned char pp[] = "<title>(.*)</title>";
const unsigned char *p = pp;
int infosize;
pcre2_match_data *match_data;
pcre2_code *re = pcre2_compile(p, 19, 0, &errcode, &erroff, NULL);
if (re == NULL) {
printf("compile err: %d %zu\n", errcode, erroff);
return 1;
}
int n = pcre2_pattern_info(re, PCRE2_INFO_SIZE, &infosize);
if (n < 0) {
printf("fullinfo err: %d\n", n);
pcre2_code_free(re);
return 1;
}
printf("fullinfo res: %d\n", infosize);
match_data = pcre2_match_data_create(9 / 3, NULL);
int rc = pcre2_match(re, s, 26, 0, 0, match_data, NULL);
if (rc < 0) {
pcre2_code_free(re);
printf("pcre2_match %d\n", rc);
return 1;
}
int nc = pcre2_get_ovector_count(match_data);
size_t *ovector = pcre2_get_ovector_pointer(match_data);
for (int i=0; i<nc; i++) {
const unsigned char *substring_start = s + ovector[2*i];
int substring_length = ovector[2*i+1] - ovector[2*i];
if (substring_length == 0) continue;
printf("$%2d: %.*s\n", i, substring_length, substring_start);
}
return 0;
}
编译&运行:
gcc pcre2.c -I pcre2/10.44/include/ -L pcre2/10.44/lib/ -lpcre2-8.0
./a.out
fullinfo res: 183
$ 0: <title>Hello World</title>
$ 1: Hello World
例子2
#define PCRE_STATIC
#include <stdio.h>
#include <string.h>
#include <pcre.h>
/*
 * Named-capture demo: compile a pattern with named groups, dump the
 * pattern's size, capture count and name table, then match a host name and
 * print every captured substring.
 *
 * Returns 0 on success, 1 if pcre_compile(), any pcre_fullinfo() query or
 * pcre_exec() fails (including "no match").
 */
int main() {
    enum { OVECCOUNT = 30 };    /* pcre_exec() expects a multiple of 3 */

    const char *err;
    int erroffset;
    const char *s = "imgxs.liwq.applinzi.com";
    /* NOTE(review): the '.' separators are unescaped, so they match any character */
    const char *p = "(?<srv>imgxs).(?<app>[a-z]+).(?<suffix>[a-z]+).(.+)";
    size_t infosize;            /* PCRE_INFO_SIZE stores a size_t, not an int */
    int capturecount, namecount, entrysize;
    int ovector[OVECCOUNT] = {0};

    pcre *re = pcre_compile(p, 0, &err, &erroffset, NULL);
    if (re == NULL) {
        printf("compile err: %s in %s %d\n", err, p + erroffset, erroffset);
        return 1;
    }

    int n = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &infosize);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_SIZE err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_SIZE res: %zu\n", infosize);

    n = pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &capturecount);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_CAPTURECOUNT err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_CAPTURECOUNT res: %d\n", capturecount);

    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_NAMECOUNT err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_NAMECOUNT res: %d\n", namecount);

    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_NAMEENTRYSIZE err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_NAMEENTRYSIZE res: %d\n", entrysize);

    /*
     * Read the table as unsigned bytes: each entry starts with a 2-byte
     * big-endian group number, and a plain (possibly signed) char would
     * sign-extend a byte >= 0x80 before the shift.
     */
    const unsigned char *nametable;
    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMETABLE, &nametable);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_NAMETABLE err: %d\n", n);
        pcre_free(re);
        return 1;
    }

    const unsigned char *pt = nametable;
    for (int i = 0; i < namecount; i++) {
        /* ovector index of the group's start offset: 2 * group number */
        int capture = 2 * ((pt[0] << 8) + pt[1]);
        printf("nametable %d %s\n", capture, (const char *) &pt[2]);
        pt += entrysize;
    }

    int rc = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, ovector, OVECCOUNT);
    if (rc < 0) {
        pcre_free(re);
        printf("pcre_exec %d\n", rc);
        return 1;
    }
    if (rc == 0) {
        /* matched, but ovector was too small: only OVECCOUNT/3 pairs are valid */
        rc = OVECCOUNT / 3;
    }

    for (int i = 0; i < rc; i++) {
        if (ovector[2 * i] < 0) {
            /* group did not take part in the match; offsets are -1 */
            continue;
        }
        const char *substring_start = s + ovector[2 * i];
        int substring_length = ovector[2 * i + 1] - ovector[2 * i];
        printf("$%2d: %.*s\n", i, substring_length, substring_start);
    }

    /* release the compiled pattern on the success path as well */
    pcre_free(re);
    return 0;
}
fullinfo PCRE_INFO_SIZE res: 213
fullinfo PCRE_INFO_CAPTURECOUNT res: 4
fullinfo PCRE_INFO_NAMECOUNT res: 3
fullinfo PCRE_INFO_NAMEENTRYSIZE res: 9
nametable 4 app
nametable 2 srv
nametable 6 suffix
$ 0: imgxs.liwq.applinzi.com
$ 1: imgxs
$ 2: liwq
$ 3: applinzi
$ 4: com
注: PCRE_INFO_NAMEENTRYSIZE 是9个字节,因为最长的别名"suffix"(含结尾的0)占7个字节,再加上2个字节的捕获序号。名字表中每一项都按这个最大长度对齐。
nginx regex
nginx 配置中的部分指令支持正则表达式。在nginx源码中,这也是通过调用pcre库来实现的。 主要的实现在ngx_regex.h和ngx_regex.c这两个文件中。 ngx_regex_compile函数封装了pcre的pcre_compile和pcre_fullinfo函数。定义了如下结构:
/* A compiled regular expression as passed to ngx_regex_exec(). */
typedef struct {
pcre *code; /* compiled pattern; ngx_regex_compile() stores the pcre_compile() result here */
pcre_extra *extra; /* forwarded by ngx_regex_exec() as the "extra" argument of pcre_exec() */
} ngx_regex_t;
/* Input and output parameters of ngx_regex_compile(). */
typedef struct {
ngx_str_t pattern; /* regular expression source string */
ngx_pool_t *pool; /* pool the compilation allocates from */
ngx_int_t options; /* options for pcre_compile(); per the original note nginx currently only uses PCRE_CASELESS (case-insensitive) */
ngx_regex_t *regex; /* regex->code holds the compiled result, i.e. the pcre_compile() return value */
int captures; /* value of pcre_fullinfo(PCRE_INFO_CAPTURECOUNT): number of capture groups */
int named_captures; /* number of capture groups that were given a name */
int name_size; /* size in bytes of one entry of the name table */
u_char *names; /* name table: each entry is a 2-byte group number followed by the name; index = 2 * ((x[0] << 8) + x[1]) */
ngx_str_t err; /* buffer that receives the error message when compilation fails */
} ngx_regex_compile_t;
封装实现如下:
/*
 * Match a compiled regex against a counted string.
 *
 *   re        pointer to a structure with `code` and `extra` members
 *   s         pointer to a string structure with `data` and `len` members
 *   captures  int array that receives the offset vector
 *   size      number of elements in `captures`
 *
 * Expands to a pcre_exec() call with start offset 0 and no options and
 * yields its return value.  Every argument is parenthesized so that any
 * expression (e.g. `&regex` or a conditional) can be passed safely.
 */
#define ngx_regex_exec(re, s, captures, size)                                \
    pcre_exec((re)->code, (re)->extra, (const char *) (s)->data,             \
              (s)->len, 0, 0, (captures), (size))
/*
 * Compile rc->pattern with pcre_compile() and fill in the capture metadata
 * of *rc (captures, named_captures, name_size, names).
 *
 * Returns NGX_OK on success.  On failure returns NGX_ERROR after writing a
 * message into rc->err.
 */
ngx_int_t
ngx_regex_compile(ngx_regex_compile_t *rc)
{
    int               status, err_pos;
    char             *fmt;
    pcre             *compiled;
    const char       *err_text;
    ngx_regex_elt_t  *study;

    /* set up the memory used while compiling */
    ngx_regex_malloc_init(rc->pool);

    /* compile the regular expression */
    compiled = pcre_compile((const char *) rc->pattern.data,
                            (int) rc->options, &err_text, &err_pos, NULL);

    /* ensure that there is no current pool */
    ngx_regex_malloc_done();

    if (compiled == NULL) {

        if ((size_t) err_pos != rc->pattern.len) {
            /* the error is inside the pattern: also show where it starts */
            rc->err.len = ngx_snprintf(rc->err.data, rc->err.len,
                              "pcre_compile() failed: %s in \"%V\" at \"%s\"",
                              err_text, &rc->pattern,
                              rc->pattern.data + err_pos)
                          - rc->err.data;

        } else {
            rc->err.len = ngx_snprintf(rc->err.data, rc->err.len,
                              "pcre_compile() failed: %s in \"%V\"",
                              err_text, &rc->pattern)
                          - rc->err.data;
        }

        return NGX_ERROR;
    }

    rc->regex = ngx_pcalloc(rc->pool, sizeof(ngx_regex_t));
    if (rc->regex == NULL) {
        goto nomem;
    }

    rc->regex->code = compiled;

    /* do not study at runtime */

    if (ngx_pcre_studies != NULL) {
        study = ngx_list_push(ngx_pcre_studies);
        if (study == NULL) {
            goto nomem;
        }

        study->regex = rc->regex;
        study->name = rc->pattern.data;
    }

    /* number of capture groups in the pattern */
    fmt = "pcre_fullinfo(\"%V\", PCRE_INFO_CAPTURECOUNT) failed: %d";
    status = pcre_fullinfo(compiled, NULL, PCRE_INFO_CAPTURECOUNT,
                           &rc->captures);
    if (status < 0) {
        goto failed;
    }

    if (rc->captures == 0) {
        return NGX_OK;
    }

    /*
     * number of capture groups that carry a name, e.g.
     * test/(?<t1>.+)/(.+)/ has two captures, one of them named
     */
    fmt = "pcre_fullinfo(\"%V\", PCRE_INFO_NAMECOUNT) failed: %d";
    status = pcre_fullinfo(compiled, NULL, PCRE_INFO_NAMECOUNT,
                           &rc->named_captures);
    if (status < 0) {
        goto failed;
    }

    if (rc->named_captures == 0) {
        return NGX_OK;
    }

    /*
     * size of one name table entry: the longest NUL-terminated name plus
     * the 16-bit capture number; the value is the maximum over the whole
     * compiled pattern (a lone ?<srv> would give 6)
     */
    fmt = "pcre_fullinfo(\"%V\", PCRE_INFO_NAMEENTRYSIZE) failed: %d";
    status = pcre_fullinfo(compiled, NULL, PCRE_INFO_NAMEENTRYSIZE,
                           &rc->name_size);
    if (status < 0) {
        goto failed;
    }

    /* address of the first name table entry */
    fmt = "pcre_fullinfo(\"%V\", PCRE_INFO_NAMETABLE) failed: %d";
    status = pcre_fullinfo(compiled, NULL, PCRE_INFO_NAMETABLE, &rc->names);
    if (status < 0) {
        goto failed;
    }

    return NGX_OK;

failed:

    rc->err.len = ngx_snprintf(rc->err.data, rc->err.len, fmt, &rc->pattern,
                               status)
                  - rc->err.data;
    return NGX_ERROR;

nomem:

    rc->err.len = ngx_snprintf(rc->err.data, rc->err.len,
                               "regex \"%V\" compilation failed: no memory",
                               &rc->pattern)
                  - rc->err.data;
    return NGX_ERROR;
}
ngx中的使用
/* Links one named capture group to the nginx variable that receives it. */
typedef struct {
ngx_uint_t capture; // index of the capture's start offset in the captures array
ngx_int_t index; // index of the variable in the variables array
} ngx_http_regex_variable_t;
/* A compiled regex together with the variables bound to its named captures. */
typedef struct {
ngx_regex_t *regex; // holds the compiled regular expression
ngx_uint_t ncaptures; // number of capture groups
ngx_http_regex_variable_t *variables; // one entry per named capture group
ngx_uint_t nvariables; // number of named capture groups
ngx_str_t name; // regular expression source string
} ngx_http_regex_t;
/*
 * Compile a regular expression for the HTTP module and register one nginx
 * variable per named capture group.
 *
 * cf  configuration context; its pool is used for all allocations
 * rc  compile parameters; rc->pool is overwritten with cf->pool
 *
 * Returns the new ngx_http_regex_t, or NULL if compilation, an allocation
 * or a variable registration fails.
 */
ngx_http_regex_t *
ngx_http_regex_compile(ngx_conf_t *cf, ngx_regex_compile_t *rc)
{
    u_char                     *entry;
    size_t                      entry_size;
    ngx_str_t                   var_name;
    ngx_uint_t                  i, named;
    ngx_http_regex_t           *hre;
    ngx_http_variable_t        *var;
    ngx_http_regex_variable_t  *vars;
    ngx_http_core_main_conf_t  *cmcf;

    rc->pool = cf->pool;

    if (ngx_regex_compile(rc) != NGX_OK) {
        ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, "%V", &rc->err);
        return NULL;
    }

    hre = ngx_pcalloc(cf->pool, sizeof(ngx_http_regex_t));
    if (hre == NULL) {
        return NULL;
    }

    hre->regex = rc->regex;
    hre->ncaptures = rc->captures;
    hre->name = rc->pattern;

    /* keep the largest capture count seen so far in the main conf */
    cmcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_core_module);
    cmcf->ncaptures = ngx_max(cmcf->ncaptures, hre->ncaptures);

    /* named captures are the ones that become variables */
    named = (ngx_uint_t) rc->named_captures;

    if (named == 0) {
        return hre;
    }

    vars = ngx_palloc(rc->pool, named * sizeof(ngx_http_regex_variable_t));
    if (vars == NULL) {
        return NULL;
    }

    hre->variables = vars;
    hre->nvariables = named;

    entry_size = rc->name_size;

    /* walk the name table, one fixed-size entry per named capture */
    for (i = 0, entry = rc->names; i < named; i++, entry += entry_size) {

        /* two-byte big-endian group number -> index into the captures array */
        vars[i].capture = 2 * ((entry[0] << 8) + entry[1]);

        /* the NUL-terminated name follows the group number */
        var_name.data = &entry[2];
        var_name.len = ngx_strlen(var_name.data);

        var = ngx_http_add_variable(cf, &var_name, NGX_HTTP_VAR_CHANGEABLE);
        if (var == NULL) {
            return NULL;
        }

        vars[i].index = ngx_http_get_variable_index(cf, &var_name);
        if (vars[i].index == NGX_ERROR) {
            return NULL;
        }

        var->get_handler = ngx_http_variable_not_found;
    }

    return hre;
}
/*
 * Match string s against a compiled regex and publish the named captures
 * as request variables.
 *
 * Returns NGX_OK on a match, NGX_DECLINED when the string does not match,
 * and NGX_ERROR on allocation failure or any other negative result of
 * ngx_regex_exec().
 */
ngx_int_t
ngx_http_regex_exec(ngx_http_request_t *r, ngx_http_regex_t *re, ngx_str_t *s)
{
    ngx_int_t                   matched, var_index;
    ngx_uint_t                  i, cap, vec_size;
    ngx_http_variable_value_t  *value;
    ngx_http_core_main_conf_t  *cmcf;

    cmcf = ngx_http_get_module_main_conf(r, ngx_http_core_module);

    vec_size = 0;

    /* a captures array is only needed when the regex has capture groups */
    if (re->ncaptures) {

        /*
         * maximum over all compiled regexes; per the original note the
         * value is already tripled as PCRE requires (the scaling is not
         * done in this function)
         */
        vec_size = cmcf->ncaptures;

        if (r->captures == NULL) {
            r->captures = ngx_palloc(r->pool, vec_size * sizeof(int));
            if (r->captures == NULL) {
                return NGX_ERROR;
            }
        }
    }

    /* run the match; capture offsets are written into r->captures */
    matched = ngx_regex_exec(re->regex, s, r->captures, vec_size);

    if (matched == NGX_REGEX_NO_MATCHED) {
        return NGX_DECLINED;
    }

    if (matched < 0) {
        ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
                      ngx_regex_exec_n " failed: %i on \"%V\" using \"%V\"",
                      matched, s, &re->name);
        return NGX_ERROR;
    }

    /* assign each named capture to its variable */
    for (i = 0; i < re->nvariables; i++) {

        cap = re->variables[i].capture;
        var_index = re->variables[i].index;

        value = &r->variables[var_index];

        /* the value points into s, it is not copied */
        value->data = &s->data[r->captures[cap]];
        value->len = r->captures[cap + 1] - r->captures[cap];

        value->valid = 1;
        value->no_cacheable = 0;
        value->not_found = 0;

#if (NGX_DEBUG)
        {
        ngx_http_variable_t  *defs = cmcf->variables.elts;

        ngx_log_debug2(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
                       "http regex set $%V to \"%v\"",
                       &defs[var_index].name, value);
        }
#endif
    }

    /* number of offsets in r->captures that were set by this match */
    r->ncaptures = matched * 2;

    /* the string the capture offsets refer to */
    r->captures_data = s->data;

    return NGX_OK;
}