python-data-analysis icon indicating copy to clipboard operation
python-data-analysis copied to clipboard

나무위키 크롤링이 안되네요.

Open thebluecloud opened this issue 3 years ago • 0 comments

최근 eBook을 구매해서 공부를 시작하였습니다. 나무위키 크롤링을 공부하는데 에러가 나더라고요. 아래 부분입니다: `contents_table = soup.find(name="table", attrs={"class":"table-hover"})`

그래서 찾아보니 나무위키가 보안정책을 변경하여 코드가 수정되었다고 하여, github에 올라온 01-namu-wiki-text-analysis.ipynb 파일을 구동해 보았는데 새로운 코드도 동작하지 않습니다.

아래 소스 및 콘솔로그를 올렸으니 검토 부탁드립니다.


```python
from selenium import webdriver
from bs4 import BeautifulSoup
import re

# brew 로 설치된 chromedriver의 path (Mac)
path = "/usr/local/bin/chromedriver"

# 윈도우용 크롬 웹드라이버 실행 경로 (Windows)
excutable_path = "chromedriver.exe"

# 크롤링할 사이트 주소를 정의합니다.
source_url = "https://namu.wiki/RecentChanges"

# 사이트의 html 구조에 기반하여 크롤링을 수행합니다.
# driver = webdriver.Chrome(path) # for Mac
driver = webdriver.Chrome(executable_path=excutable_path) # for Windows
driver.get(source_url)
req = driver.page_source
soup = BeautifulSoup(req, "html.parser")
contents_table = soup.find(name="table")
table_body = contents_table.find(name="tbody")
table_rows = table_body.find_all(name="tr")
```


SessionNotCreatedException Traceback (most recent call last)
C:\Users\mhyun\AppData\Local\Temp\ipykernel_7804\502150854.py in <module>
     14 # 사이트의 html 구조에 기반하여 크롤링을 수행합니다.
     15 # driver = webdriver.Chrome(path) # for Mac
---> 16 driver = webdriver.Chrome(executable_path=excutable_path) # for Windows
     17 driver.get(source_url)
     18 req = driver.page_source

C:\Users\mhyun\anaconda3\envs\pybook\lib\site-packages\selenium\webdriver\chrome\webdriver.py in init(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, service, keep_alive) 71 port, options, 72 service_args, desired_capabilities, ---> 73 service_log_path, service, keep_alive)

C:\Users\mhyun\anaconda3\envs\pybook\lib\site-packages\selenium\webdriver\chromium\webdriver.py in init(self, browser_name, vendor_prefix, port, options, service_args, desired_capabilities, service_log_path, service, keep_alive) 97 browser_name=browser_name, vendor_prefix=vendor_prefix, 98 keep_alive=keep_alive, ignore_proxy=_ignore_proxy), ---> 99 options=options) 100 except Exception: 101 self.quit()

C:\Users\mhyun\anaconda3\envs\pybook\lib\site-packages\selenium\webdriver\remote\webdriver.py in init(self, command_executor, desired_capabilities, browser_profile, proxy, keep_alive, file_detector, options) 266 self.file_detector = file_detector or LocalFileDetector() 267 self.start_client() --> 268 self.start_session(capabilities, browser_profile) 269 270 def repr(self):

C:\Users\mhyun\anaconda3\envs\pybook\lib\site-packages\selenium\webdriver\remote\webdriver.py in start_session(self, capabilities, browser_profile) 357 parameters = {"capabilities": w3c_caps, 358 "desiredCapabilities": capabilities} --> 359 response = self.execute(Command.NEW_SESSION, parameters) 360 if 'sessionId' not in response: 361 response = response['value']

C:\Users\mhyun\anaconda3\envs\pybook\lib\site-packages\selenium\webdriver\remote\webdriver.py in execute(self, driver_command, params) 422 response = self.command_executor.execute(driver_command, params) 423 if response: --> 424 self.error_handler.check_response(response) 425 response['value'] = self._unwrap_value( 426 response.get('value', None))

C:\Users\mhyun\anaconda3\envs\pybook\lib\site-packages\selenium\webdriver\remote\errorhandler.py in check_response(self, response) 245 alert_text = value['alert'].get('text') 246 raise exception_class(message, screen, stacktrace, alert_text) # type: ignore[call-arg] # mypy is not smart enough here --> 247 raise exception_class(message, screen, stacktrace) 248 249 def _value_or_default(self, obj: Mapping[_KT, _VT], key: _KT, default: _VT) -> _VT:

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 98
Current browser version is 97.0.4692.71 with binary path C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
Stacktrace:
Backtrace:
Ordinal0 [0x007AA0A3+2597027]
Ordinal0 [0x0073BA61+2144865]
Ordinal0 [0x00633C58+1064024]
Ordinal0 [0x00653708+1193736]
Ordinal0 [0x0064F585+1176965]
Ordinal0 [0x0064CD41+1166657]
Ordinal0 [0x0067D23F+1364543]
Ordinal0 [0x0067CE6A+1363562]
Ordinal0 [0x006785C6+1344966]
Ordinal0 [0x006554C6+1201350]
Ordinal0 [0x006563B6+1205174]
GetHandleVerifier [0x009510A2+1681842]
GetHandleVerifier [0x00A03B0C+2413596]
GetHandleVerifier [0x00841221+568113]
GetHandleVerifier [0x0083FFA3+563379]
Ordinal0 [0x0074174E+2168654]
Ordinal0 [0x00746538+2188600]
Ordinal0 [0x00746680+2188928]
Ordinal0 [0x0075030C+2229004]
BaseThreadInitThunk [0x7794FA29+25]
RtlGetAppContainerNamedObjectPath [0x77B37A9E+286]
RtlGetAppContainerNamedObjectPath [0x77B37A6E+238]

thebluecloud avatar Jan 21 '22 02:01 thebluecloud