Inflearn Community Q&A

Co Di

Help with a "list index out of range" error while crawling


This isn't code I wrote myself.

While crawling I keep getting a "list index out of range" error and couldn't find a solution, so I'm asking here.

from urllib.request import urlopen
from bs4 import BeautifulSoup
from xml.dom.pulldom import END_DOCUMENT
import pandas as pd
import requests
from datetime import datetime
import re
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import math
from time import sleep
from multiprocessing.dummy import Pool
import multiprocessing as mp
from multiprocessing.pool import MaybeEncodingError

start_date = "y1=2019&m1=09&d1=25"
end_date = "y2=2019&m2=09&d2=30"
url = "https://find.mk.co.kr/new/search.php?pageNum={}&cat=&cat1=&media_eco=&pageSize=10&sub=all&dispFlag=OFF&page=news&s_kwd=%BB%EF%BC%BA%C0%FC%C0%DA&s_page=news&go_page=&ord=1&ord1=1&ord2=0&s_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&period=p_direct&s_i_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&s_author=&{}&{}&ord=1&area=ttbd"


def get_list(idx):
    # idx = page number of the search results
    req = requests.get(url.format(idx, start_date, end_date))

    # decode as EUC-KR so the Korean text doesn't come out garbled
    soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
    div_list = soup.find_all('div', {'class': 'sub_list'})
    art_list = [i.find('span', {'class': 'art_tit'}) for i in div_list]

    # fields to store in the DB: title, href, body, date
    df = pd.DataFrame(columns={'title', 'href', 'date', 'body'})
    for article in art_list:
        append_flag = True

        title = str(article.find("a").contents[0])
        href = str(article.find("a")["href"])
        body_text = None
        date = None
        try:
            req = requests.get(href, timeout=2)
        except requests.exceptions.Timeout as errd:
            print("Timeout Error : ", errd)
        except requests.exceptions.ConnectionError as errc:
            print("Error Connecting : ", errc)
        except requests.exceptions.HTTPError as errb:
            print("Http Error : ", errb)
        # any other requests error not caught above
        except requests.exceptions.RequestException as erra:
            print("AnyException : ", erra)

        try:
            soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
        except:
            print("parser error")

        date_text = soup.find('li', {'class': 'lasttime'})
        if not date_text:
            date_text = soup.find('li', {'class': 'lasttime1'})
        if date_text:
            match = re.search(r'\d{4}.\d{2}.\d{2}', date_text.string)
            if match:
                date = datetime.strptime(match.group(), '%Y.%m.%d').date()
            else:
                print("match none")
        else:
            append_flag = False
            #print("missing date text")

        art_text = soup.find('div', {'class': 'art_txt'})
        if not art_text:
            art_text = soup.find('div', {'class': 'article_body'})
        if not art_text:
            art_text = soup.find('div', {'class': 'view_txt'})
        if art_text:
            body_text = art_text.get_text()
        else:
            append_flag = False
            #print("missing body text")
            #print("link : " + href)

        if append_flag:
            temp = pd.DataFrame({'title': [title], 'href': [href], 'date': [date], 'body': [body_text]})
            df = df.append(temp)

    return df


def get_count():
    req = requests.get(url.format(1, start_date, end_date))
    # decode as EUC-KR so the Korean text doesn't come out garbled
    soup = BeautifulSoup(req.content.decode('euc-kr', 'replace'), 'html.parser')
    count_text = soup.find('span', {'class': 'class_tit'}).get_text().replace(",", "")
    art_count = re.search(r"\d+", count_text)
    "y1=2019&m1=03&d1=15"  # stray string literal, has no effect

    print(start_date[3:7] + "년 " + start_date[11:13] + "월 " + start_date[17:] + "일 부터 "
          + end_date[3:7] + "년 " + end_date[11:13] + "월 " + end_date[17:] + "일 까지 총 "
          + art_count.group(0) + "개의 기사")

    return art_count.group(0)


if __name__ == "__main__":

    count = get_count()
    tasks_count = math.ceil(float(count) / 20) + 1

    #tasks = range(1,10)
    tasks = range(1, tasks_count)
    result_list = process_map(get_list, tasks, max_workers=4)
    df = pd.concat(result_list)
    #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

    print(df)
    file_name = start_date[5:7] + start_date[11:13] + start_date[17:] + "_" + end_date[5:7] + end_date[11:13] + end_date[17:]
    df.to_csv(file_name + '.csv', index=False, encoding='utf-8-sig')

 

------------------------------------------------------------------------------------------------------------------------------------------------------

That's the code,

 

_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 175, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in _process_chunk
    return [fn(*args) for args in chunk]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in <listcomp>
    return [fn(*args) for args in chunk]
  File "<ipython-input-7-167ab35f9166>", line 22, in get_list
    title = str(article.find("a").contents[0])
IndexError: list index out of range
"""

The above exception was the direct cause of the following exception:

IndexError                                Traceback (most recent call last)
<ipython-input-7-167ab35f9166> in <module>()
     96   #tasks = range(1,10)
     97   tasks = range(1,tasks_count)
---> 98   result_list = process_map(get_list, tasks,max_workers=4)
     99   df = pd.concat(result_list)
    100   #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in process_map(fn, *iterables, **tqdm_kwargs)
    128         tqdm_kwargs = tqdm_kwargs.copy()
    129         tqdm_kwargs["lock_name"] = "mp_lock"
--> 130     return _executor_map(ProcessPoolExecutor, fn, *iterables, **tqdm_kwargs)

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs)
     74             map_args.update(chunksize=chunksize)
     75         with PoolExecutor(**pool_kwargs) as ex:
---> 76             return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
     77 
     78 

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/notebook.py in __iter__(self)
    255     def __iter__(self):
    256         try:
--> 257             for obj in super(tqdm_notebook, self).__iter__():
    258                 # return super(tqdm...) will not catch exception
    259                 yield obj

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/std.py in __iter__(self)
   1183 
   1184         try:
-> 1185             for obj in iterable:
   1186                 yield obj
   1187                 # Update and possibly print the progressbar.

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py in _chain_from_iterable_of_lists(iterable)
    364     careful not to keep references to yielded objects.
    365     """
--> 366     for element in iterable:
    367         element.reverse()
    368         while element:

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result_iterator()
    584                     # Careful not to keep a reference to the popped future
    585                     if timeout is None:
--> 586                         yield fs.pop().result()
    587                     else:
    588                         yield fs.pop().result(end_time - time.monotonic())

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
    430                 raise CancelledError()
    431             elif self._state == FINISHED:
--> 432                 return self.__get_result()
    433             else:
    434                 raise TimeoutError()

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result


---------------------------------------------------------------------------------------------------------------------

and this is the error I get.





The error seems to come from title = str(article.find("a").contents[0]):
contents doesn't exist there, but the code still tries to index into it, which is apparently what raises the error.
If contents is not guaranteed to exist, how do I add handling for the case where it is missing?
I'm not sure where to put the check or what exactly to write, so I'm stuck and asking here.
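
Would something along the lines of the sketch below be a reasonable way to guard against it? This is only a rough sketch, assuming it's fine to simply skip articles whose link tag is missing, empty, or without an href; the helper name extract_title_href is just made up for illustration.

from bs4 import BeautifulSoup

def extract_title_href(article):
    # Return (title, href) for an article tag, or None when the <a> tag,
    # its text, or its href is missing, so the caller can skip the article.
    a_tag = article.find("a") if article else None
    if a_tag is None or not a_tag.contents or not a_tag.get("href"):
        return None
    return str(a_tag.contents[0]), str(a_tag["href"])

# quick self-check with a fake snippet: one usable link, one empty one
html = ('<span class="art_tit"><a href="http://example.com">some title</a></span>'
        '<span class="art_tit"><a href="http://example.com"></a></span>')
soup = BeautifulSoup(html, "html.parser")
for art in soup.find_all("span", {"class": "art_tit"}):
    print(extract_title_href(art))  # ('some title', 'http://example.com'), then None

# inside the loop in get_list it would presumably be used like:
#     parsed = extract_title_href(article)
#     if parsed is None:
#         continue  # skip articles without a usable link
#     title, href = parsed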

Answers

This question is still waiting for an answer.
Be the first to leave one!