This isn't code I wrote myself.
While crawling I keep getting a "list index out of range" error and couldn't find a fix, so I'm asking here.
from urllib.request import urlopen
from bs4 import BeautifulSoup
from xml.dom.pulldom import END_DOCUMENT
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import math
from time import sleep
from multiprocessing.dummy import Pool
import multiprocessing as mp
from multiprocessing.pool import MaybeEncodingError

start_date = "y1=2019&m1=09&d1=25"
end_date = "y2=2019&m2=09&d2=30"
url = "https://find.mk.co.kr/new/search.php?pageNum={}&cat=&cat1=&media_eco=&pageSize=10&sub=all&dispFlag=OFF&page=news&s_kwd=%BB%EF%BC%BA%C0%FC%C0%DA&s_page=news&go_page=&ord=1&ord1=1&ord2=0&s_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&period=p_direct&s_i_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&s_author=&{}&{}&ord=1&area=ttbd"

def get_list(idx):
    # idx = page number in the search results
    req = requests.get(url.format(idx, start_date, end_date))
    # decode as EUC-KR, otherwise the Korean text comes out garbled
    soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
    div_list = soup.find_all('div', {'class': 'sub_list'})
    art_list = [i.find('span', {'class': 'art_tit'}) for i in div_list]
    # fields to store in the DB: title, href, body, date
    df = pd.DataFrame(columns={'title', 'href', 'date', 'body'})
    for article in art_list:
        append_flag = True
        title = str(article.find("a").contents[0])
        href = str(article.find("a")["href"])
        body_text = None
        date = None
        try:
            req = requests.get(href, timeout=2)
        except requests.exceptions.Timeout as errd:
            print("Timeout Error : ", errd)
        except requests.exceptions.ConnectionError as errc:
            print("Error Connecting : ", errc)
        except requests.exceptions.HTTPError as errb:
            print("Http Error : ", errb)
        # any error other than the exceptions above
        except requests.exceptions.RequestException as erra:
            print("AnyException : ", erra)
        try:
            soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
        except:
            print("parser error")
        date_text = soup.find('li', {'class': 'lasttime'})
        if not date_text:
            date_text = soup.find('li', {'class': 'lasttime1'})
        if date_text:
            match = re.search(r'\d{4}.\d{2}.\d{2}', date_text.string)
            if match:
                date = datetime.strptime(match.group(), '%Y.%m.%d').date()
            else:
                print("match none")
        else:
            append_flag = False
            #print("missing date text")
        art_text = soup.find('div', {'class': 'art_txt'})
        if not art_text:
            art_text = soup.find('div', {'class': 'article_body'})
        if not art_text:
            art_text = soup.find('div', {'class': 'view_txt'})
        if art_text:
            body_text = art_text.get_text()
        else:
            append_flag = False
            #print("missing body text")
            #print("link : " + href)
        if append_flag:
            temp = pd.DataFrame({'title': [title], 'href': [href], 'date': [date], 'body': [body_text]})
            df = df.append(temp)
    return df

def get_count():
    req = requests.get(url.format(1, start_date, end_date))
    # decode as EUC-KR, otherwise the Korean text comes out garbled
    soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
    count_text = soup.find('span', {'class': 'class_tit'}).get_text().replace(",", "")
    art_count = re.search("\d+", count_text)
    # date strings look like "y1=2019&m1=03&d1=15"
    print(start_date[3:7] + "년 " + start_date[11:13] + "월 " + start_date[17:] + "일 부터 "
          + end_date[3:7] + "년 " + end_date[11:13] + "월 " + end_date[17:] + "일 까지 총 "
          + art_count.group(0) + "개의 기사")
    return art_count.group(0)

if __name__ == "__main__":
    count = get_count()
    tasks_count = math.ceil(float(count)/20) + 1
    #tasks = range(1,10)
    tasks = range(1, tasks_count)
    result_list = process_map(get_list, tasks, max_workers=4)
    df = pd.concat(result_list)
    #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))
    print(df)
    file_name = start_date[5:7] + start_date[11:13] + start_date[17:] + "_" + end_date[5:7] + end_date[11:13] + end_date[17:]
    df.to_csv(file_name + '.csv', index=False, encoding='utf-8-sig')
------------------------------------------------------------------------------------------------------------------------------------------------------
That's the code.
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 175, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in _process_chunk
    return [fn(*args) for args in chunk]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in <listcomp>
    return [fn(*args) for args in chunk]
  File "<ipython-input-7-167ab35f9166>", line 22, in get_list
    title = str(article.find("a").contents[0])
IndexError: list index out of range
"""

The above exception was the direct cause of the following exception:

IndexError                                Traceback (most recent call last)
<ipython-input-7-167ab35f9166> in <module>()
     96 #tasks = range(1,10)
     97 tasks = range(1,tasks_count)
---> 98 result_list = process_map(get_list, tasks,max_workers=4)
     99 df = pd.concat(result_list)
    100 #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in process_map(fn, *iterables, **tqdm_kwargs)
    128     tqdm_kwargs = tqdm_kwargs.copy()
    129     tqdm_kwargs["lock_name"] = "mp_lock"
--> 130     return _executor_map(ProcessPoolExecutor, fn, *iterables, **tqdm_kwargs)

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs)
     74     map_args.update(chunksize=chunksize)
     75     with PoolExecutor(**pool_kwargs) as ex:
---> 76         return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
     77
     78

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/notebook.py in __iter__(self)
    255     def __iter__(self):
    256         try:
--> 257             for obj in super(tqdm_notebook, self).__iter__():
    258                 # return super(tqdm...) will not catch exception
    259                 yield obj

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/std.py in __iter__(self)
   1183
   1184         try:
-> 1185             for obj in iterable:
   1186                 yield obj
   1187                 # Update and possibly print the progressbar.

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py in _chain_from_iterable_of_lists(iterable)
    364     careful not to keep references to yielded objects.
    365     """
--> 366     for element in iterable:
    367         element.reverse()
    368         while element:

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result_iterator()
    584                     # Careful not to keep a reference to the popped future
    585                     if timeout is None:
--> 586                         yield fs.pop().result()
    587                     else:
    588                         yield fs.pop().result(end_time - time.monotonic())

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
    430                 raise CancelledError()
    431             elif self._state == FINISHED:
--> 432                 return self.__get_result()
    433             else:
    434                 raise TimeoutError()

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result
---------------------------------------------------------------------------------------------------------------------
That's the error I get.
It fails at title = str(article.find("a").contents[0]):
it looks like the error happens because contents doesn't exist, yet the code tries to index into it.
If contents isn't guaranteed to exist, how do I add exception handling for the case where it's missing?
I'm stuck because I don't know where to put that handling or what to write, which is why I'm asking.
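For reference, here is a minimal, self-contained sketch of one way to guard that line. It assumes you simply want to skip search results whose <a> tag is missing, empty (so its .contents list is []), or has no href attribute; the skip-with-continue behavior and the example.com test data are my assumptions, not something from the original code:

from bs4 import BeautifulSoup

# Two fake search results: the second has an empty <a>, which reproduces
# the IndexError raised by .contents[0] on an empty contents list.
html = '''
<span class="art_tit"><a href="http://example.com/1">first article</a></span>
<span class="art_tit"><a href="http://example.com/2"></a></span>
'''
soup = BeautifulSoup(html, 'html.parser')

for article in soup.find_all('span', {'class': 'art_tit'}):
    link = article.find('a')
    # article.find('a') can return None, the tag's .contents list can be
    # empty, and the href attribute can be absent -- check all three
    # before indexing, and skip the entry instead of crashing.
    if link is None or not link.contents or not link.has_attr('href'):
        continue
    title = str(link.contents[0])
    href = str(link['href'])
    print(title, href)  # only the first, well-formed entry is printed

Inside get_list() the same guard would go at the top of the for article in art_list: loop, right before the title = ... line. Note that art_list itself can also contain None entries (when a sub_list div has no art_tit span), so checking article is None there as well seems worthwhile; wrapping the two accesses in try/except IndexError is an equally valid alternative if you prefer exception handling over an explicit check.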