파이썬 문제 도움 부탁드립니다.

Question

네이버 뉴스에서 주어진 키워드에 해당하는 뉴스 기사의 타이틀을 실시간으로 크롤링하여 워드 클라우드로 시각화 하세요. (l  크롤링 키워드 : 신종 코로나)>>빈칸에 있는것만 채우면 되는데 3,4,5번에서 막혔습니다... 부탁드립니다. !!from bs4 import BeautifulSoupimport requestsfrom konlpy.tag import Oktfrom collections import Counterfrom wordcloud import WordCloudimport matplotlib.pyplot as pltimport pandas as pd# 키워드 검색 뉴스 가사 타이틀 가져오기def getnewstitles(startnum, endnum):while True :if startnum > endnum:breakurl = 'https://search.naver.com/search.naver?where=news&sm=tab_jum&query={}&start={}'.format(searchword, startnum)response = requests.get(url)html = response.text1. soup = BeautifulSoup(source, 'html.parser')# 뉴스 타이틀 가져오기2. titles = soup.select ('ul.type01 > li > dl > dt > a')# 뉴스 타이틀을 list에 저장for title in titles:title_list.append(title['title'])start_num += 10 # 읽어올 기사 수 조정# 수집한 기사 타이틀 출력for no, title in enumerate(title_list, start=1) :print(no, title)print("-"* 120)# 워드 클라우드 그리기def make_wordcloud(wordcount):okt = Okt()sentences_tag = []# 형태소로 분석하여 리스트에 넣기for sentence in title_list:morph = okt.pos(sentence)sentences_tag.append(morph)nounadjlist = []# 형태소 중 명사와 형용사만 리스트에 넣기for sentence in sentences_tag:for word, tag in sentence:if tag in ['Noun', 'Adjective']:nounadjlist.append(word)# 단어 빈도수 세기count = Counter(nounadjlist)# 빈도수가 높은 단어 50개 추출, 글자 길이가 2이상인 단어만 추출wordInfo = dict()for tags, counts in count.most_common(wordcount):if len(wordInfo) >= 50 : # 빈도수가 높은 단어 50개 추출breakif (len(str(tags)) > 1): # 글자 길이가 2이상인 단어만 추출wordInfo[tags] = countsprint ("%s : %d" % (tags, counts))# wordcloud 객체 생성(한글깨지는 문제 해결하기위해 font_path 지정)font_path = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'wc = WordCloud(fontpath = fontpath,background_color='white', width=600, height=480)# wordcloud 객체에 데이터 매핑wc.generatefromfrequencies(wordInfo)# wordcloud 그리기plt.figure(figsize=(10, 6))plt.axis('off')plt.imshow(wc)plt.show()# 메인 프로그램if name == 'main':3. search_word = # 키워드 검색title_list = []# 뉴스 기사 타이틀 크롤링(시작 ~ 끝)4. (1, 50)# 워드클라우드 그리기5. (100)

초록푸들257 · Accepted Answer

기존 코드를 유지하는 조건에서 보완하였습니다. 코드 동작의 확인이 필요하시면 Colab에서 확인 가능합니다. (폰트를 지정하지 않아 글자가 깨질 수는 있습니다. 이 점만 로컬에서 수정하시면 됩니다.) 시간이 될 때, 변수·함수명의 비일관성과 오타 및 변수 scope는 추가로 리팩토링하면 좋을 것 같아요 :)

from bs4 import BeautifulSoup
import requests
from konlpy.tag import Okt
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

def getnewstitles(start_num, end_num):
  """Collect Naver news search titles into the global ``title_list``.

  Requests the news search page for the module-level ``search_word`` with
  the ``start`` query parameter set to ``start_num``, ``start_num + 10``,
  ... for as long as it does not exceed ``end_num``. Every title found is
  appended to ``title_list``; afterwards the whole list is printed with a
  running number.

  Args:
    start_num: Value of ``start`` for the first request.
    end_num: Largest ``start`` value that may still be requested.
  """
  base_url = 'https://search.naver.com/search.naver'

  # Advance the offset once per page (not once per title) so that
  # ``end_num`` really bounds the crawl.
  # NOTE(review): the step of 10 presumably equals the page size -- confirm.
  for page_start in range(start_num, end_num + 1, 10):
    # Let requests build the query string so that spaces and reserved
    # characters in the keyword are encoded correctly.
    params = {
      'where': 'news',
      'sm': 'tab_jum',
      'query': search_word,
      'start': page_start,
    }
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(base_url, params=params, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): this selector is tied to the markup of the result page;
    # verify that it still matches the live site.
    titles = soup.select('ul.type01 > li > dl > dt > a')
    if not titles:
      # Nothing matched on this page, so further requests are pointless.
      break

    for title in titles:
      # Prefer the ``title`` attribute; fall back to the link text instead
      # of raising KeyError when the attribute is missing.
      title_list.append(title.get('title') or title.get_text(strip=True))

  # Print the collected titles, followed by a single separator line.
  for no, title in enumerate(title_list, start=1):
    print(no, title)
  print("-" * 120)

def make_wordcloud(wordcount):
  """Draw a word cloud from the titles stored in the global ``title_list``.

  Each title is tagged with Okt; only nouns and adjectives are counted.
  From the ``wordcount`` most frequent words, those longer than one
  character are kept (at most 50), printed with their frequency and
  rendered as a single word cloud figure.

  Args:
    wordcount: Number of most frequent words to consider before the
      length filter and the 50-word cap are applied.
  """
  import os  # local import: only needed for the font lookup below

  okt = Okt()

  # Build the list once, outside any loop, so that it is neither reset per
  # title nor left undefined when ``title_list`` is empty.
  nounadjlist = [
    word
    for sentence in title_list
    for word, tag in okt.pos(sentence)
    if tag in ('Noun', 'Adjective')
  ]
  count = Counter(nounadjlist)

  word_info = {}
  for word, freq in count.most_common(wordcount):
    if len(word_info) >= 50:  # keep at most the 50 most frequent words
      break
    if len(str(word)) > 1:  # keep only words of two or more characters
      word_info[word] = freq
      print("%s : %d" % (word, freq))

  if not word_info:
    # Nothing to draw; skip rendering instead of handing WordCloud an
    # empty frequency table.
    print("No words to draw.")
    return

  # Without a Korean font the glyphs may come out broken, so use the Nanum
  # font when it is installed and fall back to the library default otherwise.
  font_path = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
  wc_options = {'background_color': 'white', 'width': 600, 'height': 480}
  if os.path.isfile(font_path):
    wc_options['font_path'] = font_path
  wc = WordCloud(**wc_options)

  # Map the frequencies onto the word cloud once, after the selection loop
  # has finished, so that exactly one figure is drawn.
  wc.generate_from_frequencies(word_info)

  plt.figure(figsize=(10, 6))
  plt.axis('off')
  plt.imshow(wc)
  plt.show()

if __name__ == "__main__":
  # Search keyword; read as a module-level global by getnewstitles().
  search_word = "신종 코로나"
  # Shared list: getnewstitles() appends to it, make_wordcloud() reads it.
  title_list = []
  # Crawl news article titles (start ~ end)
  getnewstitles(1, 50)
  # Draw the word cloud
  make_wordcloud(100)

막히는 부분이 있다면 말씀해 주세요. 함께 고민해 보겠습니다.

생활꿀팁

생활꿀팁

파이썬 문제 도움 부탁드립니다.