leevis.com icon indicating copy to clipboard operation
leevis.com copied to clipboard

pcre 库在nginx中的应用

Open vislee opened this issue 8 years ago • 0 comments

概述

pcre是与perl一致的正则表达式,nginx就是用的该库。

系统:Mac OS X pcre版本:version 8.38 2015-11-23 安装路径:

$brew list pcre
/usr/local/Cellar/pcre/8.38/bin/pcre-config
/usr/local/Cellar/pcre/8.38/bin/pcregrep
/usr/local/Cellar/pcre/8.38/bin/pcretest
/usr/local/Cellar/pcre/8.38/include/ (6 files)
/usr/local/Cellar/pcre/8.38/lib/libpcre.1.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcre16.0.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcre32.0.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcrecpp.0.dylib
/usr/local/Cellar/pcre/8.38/lib/libpcreposix.0.dylib
/usr/local/Cellar/pcre/8.38/lib/pkgconfig/ (5 files)
/usr/local/Cellar/pcre/8.38/lib/ (10 other files)
/usr/local/Cellar/pcre/8.38/share/doc/ (64 files)
/usr/local/Cellar/pcre/8.38/share/man/ (103 files)

先看个简单的例子,输出pcre版本号:


$cat pretest.c
#define PCRE_STATIC
#include <stdio.h>
#include <pcre.h>

/* Print the version string reported by the linked PCRE library. */
int main() {
    printf("version %s\n", pcre_version());
    return 0;
}

编译&执行:


gcc pretest.c -I /usr/local/Cellar/pcre/8.38/include/ -L /usr/local/Cellar/pcre/8.38/lib/ -lpcre

$./a.out
version 8.38 2015-11-23

pcre api

  • pcre_compile
 pcre *pcre_compile(const char *pattern, int options,
            const char **errptr, int *erroffset,
            const unsigned char *tableptr);

将一个正则表达式编译为一个内部结构,匹配多个字符串时可以加快匹配速度。 参数: pattern: 包含正则表达式的c字符串 options: 0或者其他参数选项 errptr: 返回的错误信息 erroffset: 正则表达式错误偏移 tableptr: 字符数组或空 具体请看man手册。

  • pcre_fullinfo
int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
            int what, void *where);

返回编译好的模式信息。 参数: code: 编译好的模式,pcre_compile的返回值。 extra: pcre_study()的返回值,或NULL what: 要返回什么信息 where: 返回的结果 具体请看man手册

  • pcre_study
pcre_extra *pcre_study(const pcre *code, int options,
            const char **errptr);

对编译好的模式进行学习,提取可以加速匹配的信息 参数: code: 编译好的模式 options: 选项 errptr: 错误信息 具体请看man手册

  • pcre_exec
int pcre_exec(const pcre *code, const pcre_extra *extra,
            const char *subject, int length, int startoffset,
            int options, int *ovector, int ovecsize);

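使用编译好的模式进行匹配,采用与Perl相似的算法。匹配成功时返回捕获到的子串组数(最高编号的已匹配分组加1;ovector 空间不足时返回0),各子串的起止偏移写入 ovector;匹配失败时返回负的错误码(例如 PCRE_ERROR_NOMATCH)。 参数: code: 编译好的模式 extra: 指向一个pcre_extra结构体,可以为NULL subject: 需要匹配的字符串 length: 匹配的字符串长度(Byte) startoffset: 匹配的开始位置 options: 选项位 ovector: 指向一个结果的整型数组 ovecsize: 数组大小(元素个数,应为3的倍数) 具体请看man手册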

例子

#define PCRE_STATIC
#include <stdio.h>
#include <string.h>
#include <pcre.h>

/*
 * Demo: compile a fixed pattern, print the size of the compiled form, match
 * it against a fixed subject and print every captured substring.
 *
 * Returns 0 on success and 1 when compiling, inspecting or matching fails.
 */
int main() {
    const char *err;
    int erroffset;
    const char *s = "<title>Hello World</title>";
    const char *p = "<title>(.*)</title>";
    /* PCRE_INFO_SIZE stores a size_t through the output pointer; an int
       here would be overrun on platforms where size_t is wider than int. */
    size_t infosize;
    /* 30 ints give room for 10 offset pairs; PCRE keeps the last third of
       the vector as workspace. */
    int ovector[30]= {0};

    pcre *re = pcre_compile(p, 0, &err, &erroffset, NULL);
    if (re == NULL) {
        printf("compile err: %s %d\n", err, erroffset);
        return 1;
    }

    int n = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &infosize);
    if (n < 0) {
        printf("fullinfo err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo res: %zu\n", infosize);

    int rc = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, ovector,
                       (int) (sizeof ovector / sizeof ovector[0]));
    if (rc < 0) {
        pcre_free(re);
        printf("pcre_exec %d\n", rc);
        return 1;
    }

    /* Pair i holds the [start, end) offsets of substring i within s. */
    for (int i = 0; i < rc; i++) {
        const char *substring_start = s + ovector[2*i];
        int substring_length = ovector[2*i+1] - ovector[2*i];
        printf("$%2d: %.*s\n", i, substring_length, substring_start);
    }

    /* The compiled pattern is owned by the caller; release it on success too. */
    pcre_free(re);
    return 0;
}

运行:

$./a.out
fullinfo res: 111
$ 0: <title>Hello World</title>
$ 1: Hello World

pcre2例子:

#define PCRE2_CODE_UNIT_WIDTH  8
#include <stdio.h>
#include <string.h>
#include <pcre2.h>

int main() {
    int errcode;
    size_t erroff;
    unsigned char ss[] = "<title>Hello World</title>";
    const unsigned char *s = ss;
    unsigned char pp[] = "<title>(.*)</title>";
    const unsigned char *p = pp;
    int infosize;
    pcre2_match_data       *match_data;

    pcre2_code *re = pcre2_compile(p, 19, 0, &errcode, &erroff, NULL);
    if (re == NULL) {
        printf("compile err: %d %zu\n", errcode, erroff);
        return 1;
    }
    int n = pcre2_pattern_info(re, PCRE2_INFO_SIZE, &infosize);
    if (n < 0) {
        printf("fullinfo err: %d\n", n);
        pcre2_code_free(re);
        return 1;
    }
    printf("fullinfo res: %d\n", infosize);

    match_data = pcre2_match_data_create(9 / 3, NULL);
    int rc = pcre2_match(re, s, 26, 0, 0, match_data, NULL);
    if (rc < 0) {
        pcre2_code_free(re);
        printf("pcre2_match %d\n", rc);
        return 1;
    }

    int nc = pcre2_get_ovector_count(match_data);
    size_t *ovector = pcre2_get_ovector_pointer(match_data);

    for (int i=0; i<nc; i++) {
        const unsigned char *substring_start = s + ovector[2*i];  
        int substring_length = ovector[2*i+1] - ovector[2*i];
        if (substring_length == 0) continue;
        printf("$%2d: %.*s\n", i, substring_length, substring_start); 
    }
    return 0;
}

编译&运行:

gcc pcre2.c -I pcre2/10.44/include/ -L pcre2/10.44/lib/ -lpcre2-8.0

./a.out
fullinfo res: 183
$ 0: <title>Hello World</title>
$ 1: Hello World

例子2

#define PCRE_STATIC
#include <stdio.h>
#include <string.h>
#include <pcre.h>

/*
 * Demo of named capture groups: compile a pattern, print its size, capture
 * count, named-capture count, name-table entry size and the name table
 * itself, then match a fixed subject and print every captured substring.
 *
 * Returns 0 on success and 1 when compiling, inspecting or matching fails.
 */
int main() {
    const char *err;
    int erroffset;
    const char *s = "imgxs.liwq.applinzi.com";
    /* NOTE: the '.' separators are not escaped, so each one matches any
       single character rather than only a literal dot. */
    const char *p = "(?<srv>imgxs).(?<app>[a-z]+).(?<suffix>[a-z]+).(.+)";
    /* PCRE_INFO_SIZE stores a size_t through the output pointer; an int
       here would be overrun on platforms where size_t is wider than int. */
    size_t infosize;
    int capturecount, namecount, entrysize;
    int ovector[30]= {0};

    pcre *re = pcre_compile(p, 0, &err, &erroffset, NULL);
    if (re == NULL) {
        printf("compile err: %s in %s %d\n", err, p+erroffset, erroffset);
        return 1;
    }
    int n = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &infosize);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_SIZE err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_SIZE res: %zu\n", infosize);

    n = pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &capturecount);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_CAPTURECOUNT err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_CAPTURECOUNT res: %d\n", capturecount);

    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_NAMECOUNT err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_NAMECOUNT res: %d\n", namecount);


    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_NAMEENTRYSIZE err: %d\n", n);
        pcre_free(re);
        return 1;
    }
    printf("fullinfo PCRE_INFO_NAMEENTRYSIZE res: %d\n", entrysize);

    /* The table is raw bytes: read it as unsigned char so that a group
       number byte >= 0x80 is not sign-extended before the shift/add. */
    unsigned char *nametable;
    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMETABLE, &nametable);
    if (n < 0) {
        printf("fullinfo PCRE_INFO_NAMETABLE err: %d\n", n);
        pcre_free(re);
        return 1;
    }

    /* Each entry is entrysize bytes: a 2-byte big-endian group number
       followed by the NUL-terminated group name. The printed value is the
       group number times two, i.e. the index of its start offset in ovector. */
    const unsigned char *pt = nametable;
    for (int i = 0; i < namecount; i++) {
        int capture = 2 * ((pt[0] << 8) + pt[1]);
        printf("nametable %d %s\n", capture, (const char *) &pt[2]);
        pt += entrysize;
    }

    int rc = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, ovector,
                       (int) (sizeof ovector / sizeof ovector[0]));
    if (rc < 0) {
        pcre_free(re);
        printf("pcre_exec %d\n", rc);
        return 1;
    }

    /* Pair i holds the [start, end) offsets of substring i within s. */
    for (int i = 0; i < rc; i++) {
        const char *substring_start = s + ovector[2*i];
        int substring_length = ovector[2*i+1] - ovector[2*i];
        printf("$%2d: %.*s\n", i, substring_length, substring_start);
    }

    /* The compiled pattern is owned by the caller; release it on success too. */
    pcre_free(re);
    return 0;
}

fullinfo PCRE_INFO_SIZE res: 213
fullinfo PCRE_INFO_CAPTURECOUNT res: 4
fullinfo PCRE_INFO_NAMECOUNT res: 3
fullinfo PCRE_INFO_NAMEENTRYSIZE res: 9
nametable 4 app
nametable 2 srv
nametable 6 suffix
$ 0: imgxs.liwq.applinzi.com
$ 1: imgxs
$ 2: liwq
$ 3: applinzi
$ 4: com

注: PCRE_INFO_NAMEENTRYSIZE 是9个字节,因为字符串"suffix"(0结尾)为7个字节,再加上2个字节的捕获顺序。

nginx regex

nginx 的配置中有些指令是支持正则表达式的。在nginx源码中也是通过调用pcre这个库来实现的。 主要的实现在:ngx_regex.h ngx_regex.c这两个文件中。 通过ngx_regex_compile函数封装了pcre的pcre_compile和pcre_fullinfo函数。定义了如下结构:

/* A compiled regular expression as handed to pcre_exec(). */
typedef struct {
    pcre        *code;   /* compiled pattern (result of pcre_compile()) */
    pcre_extra  *extra;  /* passed as the "extra" argument of pcre_exec() */
} ngx_regex_t;

/* Input and output parameters of ngx_regex_compile(). */
typedef struct {
    ngx_str_t     pattern;    /* the regular expression string */
    ngx_pool_t   *pool;      /* pool that memory for the compiled regex is taken from */
    ngx_int_t     options;    /* options for pcre_compile(); per the original note nginx currently only uses PCRE_CASELESS (case-insensitive matching) */

    ngx_regex_t  *regex;  /* regex->code is the compiled result, i.e. what pcre_compile() returned */
    int           captures;    /* value of pcre_fullinfo() PCRE_INFO_CAPTURECOUNT: the number of capture groups */
    int            named_captures;  /* number of capture groups that were given a name */
    int           name_size;  /* size in bytes of one name-table entry */
    u_char       *names;  /* name table: each entry is a 2-byte group number followed by the group name; index = 2 * ((x[0] << 8) + x[1]) */
    ngx_str_t     err;  /* buffer that receives the formatted error message on failure */
} ngx_regex_compile_t;

封装实现如下:

/*
 * Match the ngx_str_t pointed to by s against the compiled regex re by
 * calling pcre_exec() with re->code and re->extra, start offset 0 and no
 * options. captures and size are forwarded as the ovector and its size;
 * the result is whatever pcre_exec() returns.
 */
#define ngx_regex_exec(re, s, captures, size)                                \
    pcre_exec(re->code, re->extra, (const char *) (s)->data, (s)->len, 0, 0, \
              captures, size)

/*
 * Compile rc->pattern with pcre_compile() and collect capture information.
 *
 * On success rc->regex holds the compiled pattern and rc->captures,
 * rc->named_captures, rc->name_size and rc->names are filled in from
 * pcre_fullinfo() (the latter three only when the pattern has captures /
 * named captures). Returns NGX_OK.
 *
 * On failure a message is formatted into rc->err (rc->err.len is updated to
 * the message length) and NGX_ERROR is returned.
 */
ngx_int_t
ngx_regex_compile(ngx_regex_compile_t *rc)
{
    int               n, erroff;
    char             *p;
    pcre             *re;
    const char       *errstr;
    ngx_regex_elt_t  *elt;

    // set up the memory source used during compilation (rc->pool)
    ngx_regex_malloc_init(rc->pool);
    // compile the regular expression
    re = pcre_compile((const char *) rc->pattern.data, (int) rc->options,
                      &errstr, &erroff, NULL);

    /* ensure that there is no current pool */
    ngx_regex_malloc_done();

    if (re == NULL) {
        /* when the error is at the very end of the pattern there is no
           remaining text to quote, so the "at" part is omitted */
        if ((size_t) erroff == rc->pattern.len) {
           rc->err.len = ngx_snprintf(rc->err.data, rc->err.len,
                              "pcre_compile() failed: %s in \"%V\"",
                               errstr, &rc->pattern)
                      - rc->err.data;

        } else {
           rc->err.len = ngx_snprintf(rc->err.data, rc->err.len,
                              "pcre_compile() failed: %s in \"%V\" at \"%s\"",
                               errstr, &rc->pattern, rc->pattern.data + erroff)
                      - rc->err.data;
        }

        return NGX_ERROR;
    }

    rc->regex = ngx_pcalloc(rc->pool, sizeof(ngx_regex_t));
    if (rc->regex == NULL) {
        goto nomem;
    }

    rc->regex->code = re;

    /* do not study at runtime */

    /* when the ngx_pcre_studies list exists, record this regex and its
       pattern text in it */
    if (ngx_pcre_studies != NULL) {
        elt = ngx_list_push(ngx_pcre_studies);
        if (elt == NULL) {
            goto nomem;
        }

        elt->regex = rc->regex;
        elt->name = rc->pattern.data;
    }

    // number of capture groups in the pattern
    n = pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &rc->captures);
    if (n < 0) {
        p = "pcre_fullinfo(\"%V\", PCRE_INFO_CAPTURECOUNT) failed: %d";
        goto failed;
    }

    if (rc->captures == 0) {
        return NGX_OK;
    }

    // number of capture groups that were given a name
    // e.g. test/(?<t1>.+)/(.+)/ has 2 capture groups, 1 of which is named
    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &rc->named_captures);
    if (n < 0) {
        p = "pcre_fullinfo(\"%V\", PCRE_INFO_NAMECOUNT) failed: %d";
        goto failed;
    }

    if (rc->named_captures == 0) {
        return NGX_OK;
    }

    //  size of each entry in the name table:
    //  the longest NUL-terminated name plus the 16-bit group number.
    //  e.g. the name in ?<srv> needs 6 bytes; the value reported is the
    //  maximum over all names in the compiled pattern.
    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &rc->name_size);
    if (n < 0) {
        p = "pcre_fullinfo(\"%V\", PCRE_INFO_NAMEENTRYSIZE) failed: %d";
        goto failed;
    }

    //  address of the first entry of the name table
    n = pcre_fullinfo(re, NULL, PCRE_INFO_NAMETABLE, &rc->names);
    if (n < 0) {
        p = "pcre_fullinfo(\"%V\", PCRE_INFO_NAMETABLE) failed: %d";
        goto failed;
    }

    return NGX_OK;

failed:

    /* p is the format chosen at the failing pcre_fullinfo() call; n is its
       error code */
    rc->err.len = ngx_snprintf(rc->err.data, rc->err.len, p, &rc->pattern, n)
                  - rc->err.data;
    return NGX_ERROR;

nomem:

    rc->err.len = ngx_snprintf(rc->err.data, rc->err.len,
                               "regex \"%V\" compilation failed: no memory",
                               &rc->pattern)
                  - rc->err.data;
    return NGX_ERROR;
}

ngx中的使用


/* Maps one named capture group to an nginx variable. */
typedef struct {
    ngx_uint_t                    capture; // index of the capture's start offset in the captures array
    ngx_int_t                     index;  // index of the variable in the variables array
} ngx_http_regex_variable_t;

/* A compiled regex together with the variables bound to its named captures. */
typedef struct {
    ngx_regex_t                  *regex;       // holds the compiled regular expression
    ngx_uint_t                    ncaptures;  // number of capture groups
    ngx_http_regex_variable_t    *variables;  // one entry per named capture (nvariables entries)
    ngx_uint_t                    nvariables;  // number of named captures
    ngx_str_t                     name;    // the regular expression string
} ngx_http_regex_t;


// Compile a regular expression for use by the HTTP module.
//
// Compiles rc via ngx_regex_compile() using cf->pool, raises
// cmcf->ncaptures to at least this regex's capture count, and registers
// every named capture group as a changeable variable.
// Returns the new ngx_http_regex_t, or NULL on any failure.
ngx_http_regex_t *
ngx_http_regex_compile(ngx_conf_t *cf, ngx_regex_compile_t *rc)
{
    u_char                     *p;
    size_t                      size;
    ngx_str_t                   name;
    ngx_uint_t                  i, n;
    ngx_http_variable_t        *v;
    ngx_http_regex_t           *re;
    ngx_http_regex_variable_t  *rv;
    ngx_http_core_main_conf_t  *cmcf;

    rc->pool = cf->pool;

    if (ngx_regex_compile(rc) != NGX_OK) {
        ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, "%V", &rc->err);
        return NULL;
    }

    re = ngx_pcalloc(cf->pool, sizeof(ngx_http_regex_t));
    if (re == NULL) {
        return NULL;
    }

    re->regex = rc->regex;
    re->ncaptures = rc->captures;
    re->name = rc->pattern;

    /* keep track of the largest capture count seen across all regexes */
    cmcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_core_module);
    cmcf->ncaptures = ngx_max(cmcf->ncaptures, re->ncaptures);

    // number of named captures; each one is registered as a variable below
    n = (ngx_uint_t) rc->named_captures;

    if (n == 0) {
        return re;
    }

    rv = ngx_palloc(rc->pool, n * sizeof(ngx_http_regex_variable_t));
    if (rv == NULL) {
        return NULL;
    }

    // variables bound to the named captures
    re->variables = rv;
    re->nvariables = n;

    /* walk the name table: n entries of rc->name_size bytes each */
    size = rc->name_size;
    p = rc->names;

    for (i = 0; i < n; i++) {
        // index into the captures array: group number (2 bytes, high byte first) times two
        rv[i].capture = 2 * ((p[0] << 8) + p[1]);

        /* the NUL-terminated group name follows the 2-byte group number */
        name.data = &p[2];
        name.len = ngx_strlen(name.data);

        v = ngx_http_add_variable(cf, &name, NGX_HTTP_VAR_CHANGEABLE);
        if (v == NULL) {
            return NULL;
        }

        rv[i].index = ngx_http_get_variable_index(cf, &name);
        if (rv[i].index == NGX_ERROR) {
            return NULL;
        }

        v->get_handler = ngx_http_variable_not_found;

        p += size;
    }

    return re;
}


// Match the string s against an already compiled regex.
//
// Returns NGX_OK on a match (after filling in the variables bound to named
// captures and recording the capture state on the request), NGX_DECLINED
// when the regex does not match, and NGX_ERROR on allocation failure or any
// other negative result from ngx_regex_exec().
ngx_int_t
ngx_http_regex_exec(ngx_http_request_t *r, ngx_http_regex_t *re, ngx_str_t *s)
{
    ngx_int_t                   rc, index;
    ngx_uint_t                  i, n, len;
    ngx_http_variable_value_t  *vv;
    ngx_http_core_main_conf_t  *cmcf;

    cmcf = ngx_http_get_module_main_conf(r, ngx_http_core_module);

    // does this regex have any capture groups?
    if (re->ncaptures) {
        len = cmcf->ncaptures;   // largest capture count over all regexes; per the original note this value is already 3x, as the PCRE library requires -- the scaling is not visible here, TODO confirm

        /* the captures array is allocated once per request and reused */
        if (r->captures == NULL) {
            r->captures = ngx_palloc(r->pool, len * sizeof(int));
            if (r->captures == NULL) {
                return NGX_ERROR;
            }
        }

    } else {
        len = 0;
    }

    // run the match; capture offsets are written into the r->captures array
    rc = ngx_regex_exec(re->regex, s, r->captures, len);

    if (rc == NGX_REGEX_NO_MATCHED) {
        return NGX_DECLINED;
    }

    if (rc < 0) {
        ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
                      ngx_regex_exec_n " failed: %i on \"%V\" using \"%V\"",
                      rc, s, &re->name);
        return NGX_ERROR;
    }

    // assign values to the variables bound to named captures
    for (i = 0; i < re->nvariables; i++) {

        n = re->variables[i].capture;
        index = re->variables[i].index;
        vv = &r->variables[index];

        /* captures[n] and captures[n + 1] are the start and end offsets of
           the substring within s; the value points into s, no copy is made */
        vv->len = r->captures[n + 1] - r->captures[n];
        vv->valid = 1;
        vv->no_cacheable = 0;
        vv->not_found = 0;
        vv->data = &s->data[r->captures[n]];

#if (NGX_DEBUG)
        {
        ngx_http_variable_t  *v;

        v = cmcf->variables.elts;

        ngx_log_debug2(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
                       "http regex set $%V to \"%v\"", &v[index].name, vv);
        }
#endif
    }

    // number of r->captures entries in use: two offsets per substring returned by the match
    r->ncaptures = rc * 2;
    // the string that was matched; the offsets in r->captures refer to it
    r->captures_data = s->data;

    return NGX_OK;
}

vislee avatar Feb 11 '17 03:02 vislee