Inflearn Community Q&A

Co Di


Please help with a "list index out of range" error while crawling


This is not code that I wrote myself.

While crawling, a "list index out of range" error comes up and I haven't been able to find a fix, so I'm asking here.


from urllib.request import urlopen
from bs4 import BeautifulSoup
from xml.dom.pulldom import END_DOCUMENT
import pandas as pd
import requests
from datetime import datetime
import re
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import math
from time import sleep
from multiprocessing.dummy import Pool
import multiprocessing as mp
from multiprocessing.pool import MaybeEncodingError

start_date = "y1=2019&m1=09&d1=25"
end_date = "y2=2019&m2=09&d2=30"
url = "https://find.mk.co.kr/new/search.php?pageNum={}&cat=&cat1=&media_eco=&pageSize=10&sub=all&dispFlag=OFF&page=news&s_kwd=%BB%EF%BC%BA%C0%FC%C0%DA&s_page=news&go_page=&ord=1&ord1=1&ord2=0&s_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&period=p_direct&s_i_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&s_author=&{}&{}&ord=1&area=ttbd"

def get_list(idx):

    # idx = page number within the search results
    req = requests.get(url.format(idx, start_date, end_date))

    # decode as EUC-KR so the Korean text is not garbled
    soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
    div_list = soup.find_all('div', {'class': 'sub_list'})
    art_list = [i.find('span', {'class': 'art_tit'}) for i in div_list]

    # fields to store in the DB: title, href, body, date
    df = pd.DataFrame(columns={'title', 'href', 'date', 'body'})
    for article in art_list:
        append_flag = True

        title = str(article.find("a").contents[0])
        href = str(article.find("a")["href"])
        body_text = None
        date = None
        try:
            req = requests.get(href, timeout=2)
        except requests.exceptions.Timeout as errd:
            print("Timeout Error : ", errd)
        except requests.exceptions.ConnectionError as errc:
            print("Error Connecting : ", errc)
        except requests.exceptions.HTTPError as errb:
            print("Http Error : ", errb)
        # any error other than the exceptions above
        except requests.exceptions.RequestException as erra:
            print("AnyException : ", erra)

        try:
            soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
        except:
            print("parser error")

        date_text = soup.find('li', {'class': 'lasttime'})
        if not date_text:
            date_text = soup.find('li', {'class': 'lasttime1'})
        if date_text:
            match = re.search(r'\d{4}.\d{2}.\d{2}', date_text.string)
            if match:
                date = datetime.strptime(match.group(), '%Y.%m.%d').date()
            else:
                print("match none")
        else:
            append_flag = False
            #print("missing date text")

        art_text = soup.find('div', {'class': 'art_txt'})
        if not art_text:
            art_text = soup.find('div', {'class': 'article_body'})
        if not art_text:
            art_text = soup.find('div', {'class': 'view_txt'})
        if art_text:
            body_text = art_text.get_text()
        else:
            append_flag = False
            #print("missing body text")
            #print("link : " + href)

        if append_flag:
            temp = pd.DataFrame({'title': [title], 'href': [href], 'date': [date], 'body': [body_text]})
            df = df.append(temp)

    return df

def get_count():
    req = requests.get(url.format(1, start_date, end_date))
    # decode as EUC-KR so the Korean text is not garbled
    soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
    count_text = soup.find('span', {'class': 'class_tit'}).get_text().replace(",", "")
    art_count = re.search(r"\d+", count_text)

    print(start_date[3:7] + "년 " + start_date[11:13] + "월 " + start_date[17:] + "일 부터 "
          + end_date[3:7] + "년 " + end_date[11:13] + "월 " + end_date[17:] + "일 까지 총 "
          + art_count.group(0) + "개의 기사")

    return art_count.group(0)

if __name__ == "__main__":

    count = get_count()
    tasks_count = math.ceil(float(count) / 20) + 1

    #tasks = range(1,10)
    tasks = range(1, tasks_count)
    result_list = process_map(get_list, tasks, max_workers=4)
    df = pd.concat(result_list)
    #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

    print(df)
    file_name = start_date[5:7] + start_date[11:13] + start_date[17:] + "_" + end_date[5:7] + end_date[11:13] + end_date[17:]
    df.to_csv(file_name + '.csv', index=False, encoding='utf-8-sig')

 

------------------------------------------------------------------------------------------------------------------------------------------------------

That's the code.

 

_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 175, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in _process_chunk
    return [fn(*args) for args in chunk]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in <listcomp>
    return [fn(*args) for args in chunk]
  File "<ipython-input-7-167ab35f9166>", line 22, in get_list
    title = str(article.find("a").contents[0])
IndexError: list index out of range
"""

The above exception was the direct cause of the following exception:

IndexError                                Traceback (most recent call last)
<ipython-input-7-167ab35f9166> in <module>()
     96   #tasks = range(1,10)
     97   tasks = range(1,tasks_count)
---> 98   result_list = process_map(get_list, tasks,max_workers=4)
     99   df = pd.concat(result_list)
    100   #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in process_map(fn, *iterables, **tqdm_kwargs)
    128         tqdm_kwargs = tqdm_kwargs.copy()
    129         tqdm_kwargs["lock_name"] = "mp_lock"
--> 130     return _executor_map(ProcessPoolExecutor, fn, *iterables, **tqdm_kwargs)

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs)
     74             map_args.update(chunksize=chunksize)
     75         with PoolExecutor(**pool_kwargs) as ex:
---> 76             return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
     77 
     78 

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/notebook.py in __iter__(self)
    255     def __iter__(self):
    256         try:
--> 257             for obj in super(tqdm_notebook, self).__iter__():
    258                 # return super(tqdm...) will not catch exception
    259                 yield obj

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/std.py in __iter__(self)
   1183 
   1184         try:
-> 1185             for obj in iterable:
   1186                 yield obj
   1187                 # Update and possibly print the progressbar.

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py in _chain_from_iterable_of_lists(iterable)
    364     careful not to keep references to yielded objects.
    365     """
--> 366     for element in iterable:
    367         element.reverse()
    368         while element:

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result_iterator()
    584                     # Careful not to keep a reference to the popped future
    585                     if timeout is None:
--> 586                         yield fs.pop().result()
    587                     else:
    588                         yield fs.pop().result(end_time - time.monotonic())

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
    430                 raise CancelledError()
    431             elif self._state == FINISHED:
--> 432                 return self.__get_result()
    433             else:
    434                 raise TimeoutError()

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result


---------------------------------------------------------------------------------------------------------------------

And this is the error that comes up.





The error seems to come from the line title = str(article.find("a").contents[0]):
it looks like contents doesn't exist, yet the code tries to access it by index, and that's what raises the error.
If contents isn't guaranteed to exist, how can I add exception handling for the case where it's missing?
I have no idea where to put it or what to write, so I'm stuck and asking here.
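
For reference, here is a minimal sketch of the kind of guard being asked about, assuming that simply skipping malformed search results is acceptable. The helper name first_link and the choice to continue to the next article (rather than log or retry) are illustrative, not part of the original script; the tags and variables (article, art_list) are the ones from the code above.

def first_link(article):
    # Return (title, href), or None when the expected structure is missing:
    # no span.art_tit, no <a> tag inside it, an <a> with no contents, or no href.
    if article is None:
        return None
    a_tag = article.find("a")
    if a_tag is None or not a_tag.contents or not a_tag.get("href"):
        return None
    return str(a_tag.contents[0]), str(a_tag.get("href"))

Inside get_list, the top of the loop would then become:

    for article in art_list:
        append_flag = True

        parsed = first_link(article)
        if parsed is None:
            continue    # skip this search result instead of raising IndexError
        title, href = parsed

An equivalent alternative is to wrap the two title/href lines in try/except (AttributeError, IndexError, KeyError) and continue in the except block; either way the loop moves on to the next article instead of letting the exception crash the worker and surface through process_map, as in the traceback above.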

Answers

This question is still waiting for its first answer.