Python/Crawling
[Crawling] 영화 데이터 수집(날짜 설정)
퓨어맨
2022. 5. 19. 09:05
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
# Exploratory fetch of a single day's ranking page to check the selectors.
url = 'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=cur&date=20220512'
# A timeout keeps the script from hanging forever on a stalled connection.
res = req.get(url, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
soup = bs(res.text, 'lxml')
title = soup.select('div.tit5 > a')  # movie title links; length 43 when the post was written
point = soup.select('td.point')  # rating cells; length 43 when the post was written
# Collect the daily ranking pages for April 2022 into one DataFrame.
title_list = []
point_list = []
rank_list = []

# Dates are plain integers here, so this only works inside a single month.
# range() excludes its end value, so the last date requested is 20220430.
for day in range(20220401, 20220431):
    url = f'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=cur&tg=0&date={day}'
    # A timeout keeps one stalled request from blocking the whole crawl.
    res = req.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of silently collecting nothing.
    res.raise_for_status()
    soup = bs(res.text, 'lxml')
    title = soup.select('div.tit5>a')
    point = soup.select('td.point')
    # zip() pairs each title with its rating and cannot index past the
    # shorter selection; the rank is the 1-based position on the page.
    for rank, (title_tag, point_tag) in enumerate(zip(title, point), start=1):
        title_list.append(title_tag.text.strip())
        point_list.append(point_tag.text.strip())
        rank_list.append(rank)

dic = {'영화명': title_list, '평점': point_list, '순위': rank_list}
df = pd.DataFrame(dic)
df.set_index('순위', inplace=True)
20220401 ~ 20220430 일자만큼의 데이터만 수집 (range의 끝값 20220431은 포함되지 않음)
숫자로만 입력했을 땐 한 달 치 값밖에 불러오지 못하기 때문에
- Pandas를 활용하여 date 날짜 생성
# Collect the daily ranking pages for an arbitrary date span into one DataFrame.
# pd.date_range produces real calendar dates, so the span can cross months.
date = pd.date_range(start="2022-01-01", end="2022-05-12")
days = date.strftime('%Y%m%d')  # %Y is the 4-digit year (2022); %y would give 22

# Progress bar for notebooks. The top-level `tqdm_notebook` alias is
# deprecated; `tqdm.notebook.tqdm` is its replacement.
from tqdm.notebook import tqdm as tn

title_list = []
point_list = []
rank_list = []

for day in tn(days):
    url = f'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=cur&tg=0&date={day}'
    # A timeout keeps one stalled request from blocking the whole crawl.
    res = req.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of silently collecting nothing.
    res.raise_for_status()
    soup = bs(res.text, 'lxml')
    title = soup.select('div.tit5>a')
    point = soup.select('td.point')
    # zip() pairs each title with its rating and cannot index past the
    # shorter selection; the rank is the 1-based position on the page.
    for rank, (title_tag, point_tag) in enumerate(zip(title, point), start=1):
        title_list.append(title_tag.text.strip())
        point_list.append(point_tag.text.strip())
        rank_list.append(rank)

dic = {'영화명': title_list, '평점': point_list, '순위': rank_list}
df = pd.DataFrame(dic)
df.set_index('순위', inplace=True)
"date = pd.date_range(start = "2022-01-01", end = "2022-05-12")"
"days = date.strftime('%Y%m%d')"
20220101 ~ 20220512 까지의 데이터 수집