crawler icon indicating copy to clipboard operation
crawler copied to clipboard

File Match Rule not filtering unwanted results

Open kenkenchow opened this issue 4 years ago • 2 comments

I want to filter out paths that end with search.asp*, but have had no luck. How can I fix this with the gopa.yml below?

file_match_rule:
  must:
    prefix: []
    contain: []
    suffix: []
  should:
    prefix: []
    contain: []
    suffix: []
  must_not:
    prefix:
      - /search.asp
      - search.asp
    contain:
      - /search.asp
      - search.asp
    suffix:
      - /search.asp
      - search.asp

kenkenchow avatar Jul 18 '19 08:07 kenkenchow

May I get the full config?

medcl avatar Jul 20 '19 15:07 medcl

cluster:
  name: gopa
#node.name: node1

path.data: data
path.logs: log
path.certs: test/cert

cookie_secret: YOUR-GOPA-SECRET

allow_multi_instance: true
max_num_of_instances: 1

tls_enabled: false

logging.level: debug

network:
  host: 127.0.0.1

cluster.seeds:

  - 127.0.0.1:10000

modules:

  - name: pipeline
    enabled: true
    runners:

    - name: checker
      enabled: true
      input_queue: check
      max_go_routine: 5
      threshold_in_ms: 0
      timeout_in_ms: 5000
      default_config:
        start:
          joint: init_task
          enabled: true
        process:
          - joint: url_normalization
            enabled: true
            parameters:
              follow_all_domain: false
              follow_sub_domain: true
              max_filename_length: 1000
          - joint: url_filter
            enabled: true
            parameters:
              url_match_rule:
                must_not:
                  prefix:
                    - "mailto:"
                    - "data:image/"
                    - "#"
                    - "javascript:"
                  contain:
                    - "crond/run"
                    - "login"
                    - "logout"
                  suffix: []
                must:
                  prefix: []
                  contain: []
                  suffix: []
                should:
                  prefix: []
                  contain: []
                  suffix: []
              host_match_rule:
                must:
                  prefix: []
                  contain: []
                  suffix: []
                should:
                  prefix: []
                  contain:
                    - xxxxx.xxxx.com
                  suffix: []
                must_not:
                  prefix: []
                  contain: []
                  suffix: []
              file_match_rule:
                must:
                  prefix: []
                  contain: []
                  suffix: []
                should:
                  prefix: []
                  contain: []
                  suffix: []
                must_not:
                  prefix: []
                  contain:
                    - search
                    - search.asp
                  suffix: []
              file_ext_match_rule:
                should:
                  prefix: []
                  contain: []
                  suffix: []
                must:
                  prefix: []
                  contain: []
                  suffix: []
                must_not:
                  contain: [zip, exe, jar, js, css, rar, gz, zip, bmp, jpeg, gif,svg, png, jpg, apk]
                  prefix: []
                  suffix: []
          - joint: filter_check
            enabled: true
            parameters:
              filter_key: check_filter
          - joint: task_deduplication
            enabled: true
        end:
          joint: save_task
          enabled: true
          parameters:
            is_create: true
    - name: fetch
      enabled: true
      input_queue: fetch
      max_go_routine: 1
      threshold_in_ms: 0
      timeout_in_ms: 30000
      default_config:
        start:
          joint: init_task
          enabled: true
        process:
          - joint: url_normalization
            enabled: true
            parameters:
              follow_all_domain: false
              follow_sub_domain: true
              max_filename_length: 1000
          - joint: ignore_timeout
            parameters:
              ignore_timeout_after_count: 500
            enabled: true
          - joint: chrome_fetch
            enabled: false
            parameters:
              chrome_host: http://127.0.0.1:9223
              save_screenshot: true
              compress_enabled: true
              timeout_in_seconds: 60
              screenshot_quality: 50
          - joint: fetch
            enabled: true
            parameters:
              timeout_in_seconds: 100
          - joint: parse
            enabled: true
            parameters:
              dispatch_links: true
              save_images: true
              max_depth: 100
              max_breadth: 1
          - joint: extract
            enabled: false
            parameters:
              html_block:
                your_tag_name1: ".tag_class"
                your_tag_name2: "#tag_id"
          - joint: html2text
            parameters:
              remove_nonscript: true
              merge_whitespace: false
            enabled: true
          - joint: hash
            enabled: true
          - joint: content_deduplication
            enabled: true
          - joint: update_check_time
            enabled: true
            parameters:
              decelerate_steps: "24h,48h,72h,168h,360h,720h"
              accelerate_steps: "24h,12h,6h,3h,1h30m,45m,30m,20m,10m"
          - joint: lang_detect
            enabled: true
          - joint : save_snapshot_fs
            enabled: false
          - joint : save_snapshot_db
            enabled: true
            parameters:
              compress_enabled: true
              bucket: "Snapshot"
              max_revision: 5
          - joint: index
            enabled: true
        end:
          joint: save_task
          enabled: true
          keep_404: false
          keep_redirected: false
  - name: cluster
    enabled: true

  - name: dispatch
    enabled: true
    failed_retry: false
    new_task: true
    update_task: true
    max_concurrent_fetch_tasks: 10
    fetch_size: 1000
    get_host_config_by_url: false

  - name: api
    enabled: true

  - name: web
    enabled: true
    auth:
      enabled: false
      oauth_provider: github
      oauth_authorize_url: https://github.com/login/oauth/authorize
      oauth_token_url: https://github.com/login/oauth/access_token
      client_id: 850d747174ace88ce889
      client_secret: 3d437b64e06371d6f62769320438d3dfc95a8d8e
      authorized_admin:
        - medcl
        - your_github_login_name

  - name: filter
    enabled: true

  - name: queue
    enabled: true

  - name: index
    enabled: true
    ui:
      enabled: true
    elasticsearch:
      endpoint: http://localhost:9200
      index_prefix: gopa-
      username: elastic
      password: changeme

  - name: elastic
    enabled: true
    store_enabled: true
    orm_enabled: true
    elasticsearch:
      endpoint: http://localhost:9200
      index_prefix: gopa-
      username: elastic
      password: changeme

  - name: stats
    enabled: true

  - name: statsd
    enabled: false
    host: 127.0.0.1
    port: 8125
    namespace: gopa.
    interval_in_seconds: 1

plugins:

  - name: chrome
    enabled: true
    command: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    debug_port: 9223

kenkenchow avatar Aug 01 '19 03:08 kenkenchow