Python: 연습 4 (크롤링 심화)

Python

Python: 연습 4 (크롤링 심화)

이지파이 2025. 4. 29. 17:52

크롤링 심화

오늘 하려는 것은 수자원공사의 오늘자 뉴스 기사들을 크롤링하는 것이다.

이미 사내 내부망에서는 당일 조간 기사들이 매일 업로드 되고 있다.

매일 뉴스를 일일이 다 확인할 수 없는 직장인분들은 사내 내부망을 통해 뉴스 기사를 확인한다.

또한, 이 기사는 외부업체가 직접 조사하는 것으로 알고 있다.

(+ 크롤링하는 것이 아닌 직접 기사를 수집하는 것으로 알고 있다.)

내가 Python을 통해 오늘자 수자원공사 기사들을 크롤링해보고, 그 결과가 외부업체가 직접 조사한 기사와 얼마나 잘 맞는지 확인해 보고 싶었다.

이전 글에서 연습했던 크롤링은 네이버뉴스의 경제면의 금융카테고리의 기사를 찾아달라고 단순히 지정하면 됐었다. (즉, 경로를 정확히 내가 지정할 수 있었다.)

지금 하고 싶은 작업은 다음과 같은 작업이다.

네이버 -> '수자원공사' 검색 -> 뉴스 탭

당일 올라온 '수자원공사' 관련 기사 수집

1. Python 코드: 코드를 실행하면 네이버에서 자동으로 '수자원공사'를 검색해 뉴스 탭으로 들어간다. 이후, 당일 수자원공사 관련 기사를 크롤링한다.

import os
import re
import time
from datetime import datetime
from urllib.parse import quote

import urllib3
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Silence urllib3's InsecureRequestWarning and set WDM_SSL_VERIFY=0
# (presumably read by webdriver_manager to skip SSL verification when it
# downloads the driver — confirm against the webdriver_manager docs).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
os.environ['WDM_SSL_VERIFY'] = '0'

# Today's date as YYYY.MM.DD; the crawl below matches this string against
# the date text scraped from each search result.
today = datetime.today().strftime('%Y.%m.%d')
print(f"✅ 오늘 날짜: {today}")

# Chrome options: maximized window and a desktop Chrome user-agent string.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

# Initialise the driver; without a browser nothing below can run, so any
# failure here ends the script with exit status 1.
try:
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
except Exception as e:
    print(f"드라이버 초기화 실패: {e}")
    # `exit()` is injected by the `site` module for interactive use and is
    # not guaranteed to exist in every runtime; SystemExit always is.
    raise SystemExit(1)

# Relative timestamps that refer to the last few seconds/minutes/hours
# ("30초 전", "5분 전", "3시간 전").  "N일 전" / "N주 전" are deliberately not
# matched, and neither is a bare "전" inside a press name such as "대전일보".
_RECENT_RELATIVE_TIME = re.compile(r"\d+\s*(?:초|분|시간)\s*전")

# Class names that may carry an article's date on the result page.
_DATE_CLASSES = ('info', 'date')


def _is_date_class(class_name):
    """Return True when *class_name* is one of the candidate date classes."""
    return class_name in _DATE_CLASSES


def _is_today(date_text):
    """Return True when *date_text* looks like a timestamp from today.

    Accepts text containing today's date (module-level ``today``,
    formatted YYYY.MM.DD) or a seconds/minutes/hours-ago stamp.  An
    "N시간 전" stamp is treated as today even if it crosses midnight.
    """
    return today in date_text or bool(_RECENT_RELATIVE_TIME.search(date_text))


def _date_rank(date_text):
    """Rank a date text so de-duplication keeps the most useful one."""
    if _is_today(date_text):
        return 2
    if date_text != 'Unknown':
        return 1
    return 0


try:
    # Search keyword: change this value to crawl a different topic,
    # e.g. search_keyword = "환경 데이터".
    search_keyword = "수자원공사"

    # Open the Naver news search page.  The keyword is percent-encoded so
    # that characters such as '&', '#' or spaces cannot corrupt the query.
    # NOTE(review): the original comment says pd=3 selects the 1-day period;
    # confirm this against Naver's current search parameters.
    driver.get(f'https://search.naver.com/search.naver?where=news&sm=tab_opt&query={quote(search_keyword)}&sort=0&photo=0&field=0&pd=3')
    print(f"✅ '{search_keyword}'에 대한 네이버 오늘자 뉴스 검색 페이지 접속 완료")

    # Wait (up to 15 seconds) for dynamically loaded news content.
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.news_tit, div[class*='news'], ul[class*='news']"))
    )
    print("뉴스 콘텐츠 로드 완료")

    # Screenshot for debugging.
    driver.save_screenshot("news_page.png")

    # Save the rendered page source for offline inspection.
    page_source = driver.page_source
    with open("page_source.html", "w", encoding="utf-8") as f:
        f.write(page_source)
    print("페이지 소스를 'page_source.html'에 저장했습니다.")

    # Parse with BeautifulSoup.
    soup = BeautifulSoup(page_source, 'html.parser')

    # Debug aid: list the CSS classes present on the page.
    classes = {
        class_name
        for element in soup.find_all(class_=True)
        for class_name in element.get('class', [])
    }
    print(f"페이지에서 발견된 클래스 수: {len(classes)}")
    print("주요 클래스: ", ", ".join(sorted(classes)[:20]))

    # Collected (title, link, date_text) candidates from every strategy.
    news_titles = []

    # 1. Anchors whose class is one of the likely title classes.
    potential_title_classes = [
        'news_tit', 'title', 'api_txt_lines', 'tit', 'head', 'headline',
        'news-tit', 'news_headline', 'article_title'
    ]

    for class_name in potential_title_classes:
        print(f"'{class_name}' 클래스 탐색...")
        for elem in soup.find_all(class_=class_name):
            title_text = elem.text.strip()
            if elem.name == 'a' and title_text:
                date_element = elem.find_next(class_=_is_date_class)
                date_text = date_element.text.strip() if date_element else 'Unknown'
                # .get(): an <a> without href must not abort the whole crawl.
                news_titles.append((title_text, elem.get('href', ''), date_text))

    # 2. Long-text links inside containers whose class mentions news/article/list.
    news_areas = soup.find_all(
        ['div', 'li', 'article'],
        class_=lambda c: c and any(x in c for x in ['news', 'article', 'list']),
    )
    for area in news_areas:
        for link in area.find_all('a'):
            link_text = link.text.strip()
            if len(link_text) <= 15:
                continue
            parent = link.find_parent()
            # Try progressively looser places for a date element.
            date_element = (
                link.find_next_sibling('span', class_=_is_date_class)
                or link.find_next('span', class_=_is_date_class)
                or parent.find('span', class_=_is_date_class)
                or parent.find('div', class_=_is_date_class)
            )
            date_text = date_element.text.strip() if date_element else 'Unknown'
            news_titles.append((link_text, link.get('href', ''), date_text))

    # 3. Links pointing at news.naver.com.  No date is looked up here, so
    #    these entries start out as 'Unknown'.
    naver_news_links = soup.find_all('a', href=lambda h: h and 'news.naver.com' in h)
    for link in naver_news_links:
        link_text = link.text.strip()
        if len(link_text) > 15:
            news_titles.append((link_text, link['href'], 'Unknown'))

    # 4. The same two heuristics evaluated in the live DOM via JavaScript.
    news_titles_js = driver.execute_script("""
        let titles = [];
        document.querySelectorAll('a').forEach(link => {
            if (link.href.includes('news.naver.com') &&
                link.textContent.trim().length > 15) {
                let parent = link.closest('li') || link.closest('div');
                let dateElement = parent?.querySelector('span.info, span.date, div.info, div.date');
                let date = dateElement?.textContent.trim() || 'Unknown';
                titles.push({
                    title: link.textContent.trim(),
                    link: link.href,
                    date: date
                });
            }
        });
        // 추가: 클래스 기반 탐색 및 날짜 추출 시도
        document.querySelectorAll('[class*="news"], [class*="title"], [class*="tit"]').forEach(elem => {
            if (elem.tagName === 'A' && elem.textContent.trim().length > 15) {
                let parent = elem.closest('li') || elem.closest('div');
                let dateElement = parent?.querySelector('span.info, span.date, div.info, div.date');
                let date = dateElement?.textContent.trim() || 'Unknown';
                titles.push({
                    title: elem.textContent.trim(),
                    link: elem.href,
                    date: date
                });
            }
        });
        return [...new Set(titles.map(JSON.stringify))].map(JSON.parse);
    """)

    # Merge the JavaScript results into the candidate list.
    for item in news_titles_js:
        news_titles.append((item['title'], item['link'], item['date']))

    # 5. Last resort: raw text nodes (no link or date information).
    if not news_titles:
        print("기본 방법으로 기사를 찾지 못했습니다. 텍스트 노드로 시도...")
        all_text_nodes = driver.execute_script("""
            function getAllTextNodes() {
                let result = [];
                function getTextNodes(node) {
                    if (node.nodeType === 3 && node.textContent.trim().length > 20) {
                        result.push(node.textContent.trim());
                    } else {
                        for (let i = 0; i < node.childNodes.length; i++) {
                            getTextNodes(node.childNodes[i]);
                        }
                    }
                }
                getTextNodes(document.body);
                return [...new Set(result)];
            }
            return getAllTextNodes();
        """)
        print(f"텍스트 노드 수: {len(all_text_nodes)}")
        for text in all_text_nodes:
            if 20 < len(text) < 100:
                news_titles.append((text, '', 'Unknown'))

    # De-duplicate on (title, link).  Different strategies can find the same
    # article with different date texts; keep the most informative one so an
    # article is listed once.  A dict keeps first-seen order, so the output
    # is deterministic.
    best_date = {}
    for title, link, date_text in news_titles:
        key = (title, link)
        if key not in best_date or _date_rank(date_text) > _date_rank(best_date[key]):
            best_date[key] = date_text
    unique_titles = [
        (title, link, date_text) for (title, link), date_text in best_date.items()
    ]

    # Drop navigation/UI texts and anything not dated today.
    excluded_phrases = ['언론사', '구독하기', '메뉴', '로그인', '뉴스홈', '뉴스스탠드', '전체언론사', '정렬', '네이버뉴스']
    filtered_titles = [
        (title, link, date_text)
        for title, link, date_text in unique_titles
        if len(title) > 15
        and not any(phrase in title for phrase in excluded_phrases)
        and _is_today(date_text)
    ]

    # Print the results, or debugging dumps when nothing survived the filter.
    if filtered_titles:
        for idx, (title, link, date_text) in enumerate(filtered_titles, 1):
            print(f"{idx}. {title}\n  → {link}\n  📅 날짜: {date_text}")
    else:
        print("\n디버깅: 추출된 모든 제목 (오늘):")
        for title, link, date_text in unique_titles:
            if _is_today(date_text):
                print(f"- {title} | 링크: {link} | 날짜: {date_text}")
        print("\n디버깅: 추출된 모든 제목:")
        for title, link, date_text in unique_titles:
            print(f"- {title} | 링크: {link} | 날짜: {date_text}")

except TimeoutException as e:
    # The wait for news content expired; the message below suggests a
    # CAPTCHA as a possible cause.  Capture the page state, let the user
    # intervene in the browser, then save the resulting page source.
    print(f"시간 초과 오류: {e}")
    driver.save_screenshot("timeout_error.png")
    print("CAPTCHA 감지 가능성. 브라우저에서 CAPTCHA를 해결한 후 Enter를 누르세요.")
    input()
    page_source = driver.page_source
    with open("page_source_timeout.html", "w", encoding="utf-8") as f:
        f.write(page_source)

except Exception as e:
    # Top-level boundary of the script: report the error and keep a screenshot.
    print(f"실행 중 오류 발생: {e}")
    driver.save_screenshot("error.png")

finally:
    # Always close the browser, whatever happened above.
    driver.quit()
    print("브라우저 종료 완료")

2. 코드 실행 결과: terminal 창에 수자원공사의 당일 뉴스기사 수집 결과가 다음과 같이 뜬다.

3. 내부 수집 기사 vs. 크롤링 수집 기사

내가 python을 통해 크롤링하여 수집한 기사와 내부에서 수집된 기사를 비교해보고 싶었다.

얼마나 정확하게 크롤링을 하였고, 오늘자 기사를 찾아주는 것이 맞는지 확인해보고 싶었다.

ChatGPT의 도움을 받아 중복된 기사를 하나의 기사로 보고 제대로 크롤링된 것인지 확인하였다. 비교 결과는 다음과 같으며, 정확도가 매우 높은 크롤링 코드를 만든 것임을 확인할 수 있다.

(조간에 수집한 내부 기사이기에 내가 크롤링한 시점의 몇몇 기사는 조간기사에 포함 안됐고, 네이버에서만 기사를 수집했다는 점을 감안하면 정확도가 높다고 판단된다.)