Python을 배워보자

박스오피스 예매율 실시간 크롤링

_Blue_Sky_ 2025. 9. 23. 22:34
728x90

 
박스오피스예매율실시간
[실시간 예매율]은 조회시간을 기준으로 상영 2시간 이후의 예매데이터를 실시간 집계하여 예매율 정보를 제공합니다.

실시간 예매율 산출기준 = A(예매매출액) / B(전체 예매매출액) * 100

- 예매매출액(A) : 조회시점 특정영화의 상영 2시간 이후의 발권데이터

- 전체 예매매출액(B) : 조회시점 모든 영화의 상영 2시간 이후의 발권데이터

- 상영시작시간을 기준으로 2시간 이내의 발권데이터는 집계대상에서 제외됩니다. (예 : 12시 상영영화를 11시에 발권한 경우는 제외, 10시 이전에 발권한 경우는 포함)

- 예매율은 조회시점에 따라 수시 변경됩니다.

영화구분을 전체영화로 선택한 경우, 상영작 전체를 대상으로 예매율을 집계합니다.

영화구분을 개봉영화로 선택한 경우, 상영작 전체 중 개봉작의 개봉일 이후 예매율만을 집계합니다.

import requests
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import sys

# Switch stdout to line-buffered mode so output is flushed at every newline.
# This does not change the buffer size; the intent is to keep long console
# output from being cut off.
sys.stdout.reconfigure(line_buffering=True)


class KobisClient:
    """Client for the KOBIS real-time ticket reservation-rate page.

    A CSRF token is read from the rendered page with Selenium and cached on
    the instance. The search itself is sent as a form POST through a
    ``requests`` session, and the response is parsed as JSON or, when it is
    not valid JSON, as an HTML table.
    """

    # Hidden-input names probed, in this order, when looking for the CSRF token.
    _TOKEN_FIELD_NAMES = ('_CSRFTOKEN', 'csrf_token', 'CSRFToken', 'token')

    # Values sent as the repeated ``wideareaCd`` form field.
    _WIDE_AREA_CODES = (
        '0105001', '0105011', '0105012', '0105015', '0105016', '0105013',
        '0105014', '0105002', '0105003', '0105005', '0105004', '0105007',
        '0105006', '0105009', '0105008', '0105010',
    )

    # (output key, key read from each JSON ``movieList`` entry). The order of
    # the output keys is also the column order assumed for the HTML table.
    _MOVIE_FIELDS = (
        ('rank', 'rank'),
        ('title', 'movieNm'),
        ('open_date', 'openDt'),
        ('reservation_rate', 'totIssuAmtRatio'),
        ('reservation_sales', 'totIssuAmt'),
        ('total_sales', 'addTotIssuAmt'),
        ('reservation_audience', 'totIssuCnt'),
        ('total_audience', 'addTotIssuCnt'),
    )

    def __init__(self, driver_path=None):
        """Create the client.

        Args:
            driver_path: Optional path to a chromedriver binary. When it is
                falsy, ``webdriver.Chrome`` is created without an explicit
                ``Service`` (i.e. driver resolution is left to Selenium).
        """
        self.url = "https://www.kobis.or.kr/kobis/business/stat/boxs/findRealTicketList.do"
        self.driver_path = driver_path
        self.csrf_token = None  # cached CSRF token (None = not fetched yet)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': self.url,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'X-Requested-With': 'XMLHttpRequest'
        })

    def setup_driver(self):
        """Build and return a configured Chrome WebDriver.

        The caller owns the returned driver and is responsible for quitting it.
        """
        options = webdriver.ChromeOptions()
        for flag in (
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
            '--start-maximized',
            '--whitelisted-ips',
        ):
            options.add_argument(flag)
        options.add_experimental_option('excludeSwitches', ['enable-automation'])

        if self.driver_path:
            service = Service(self.driver_path)
            driver = webdriver.Chrome(service=service, options=options)
        else:
            driver = webdriver.Chrome(options=options)
        driver.implicitly_wait(3)
        return driver

    def get_csrf_token(self):
        """Return the CSRF token found on the page, or ``''`` if none is found.

        A non-empty token is cached on the instance and reused. A failed
        lookup stores ``''``, which is falsy, so the next call tries again.
        Any exception during the lookup is treated as "no token" (best
        effort); the browser is always closed.
        """
        if self.csrf_token:
            return self.csrf_token

        driver = None
        try:
            driver = self.setup_driver()
            driver.get(self.url)
            time.sleep(3)  # fixed wait for the page's JavaScript to finish

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for name in self._TOKEN_FIELD_NAMES:
                token_input = soup.find('input', {'name': name})
                if token_input and token_input.get('value'):
                    self.csrf_token = token_input.get('value')
                    return self.csrf_token
            self.csrf_token = ''
            return ''
        except Exception:
            # Deliberate best effort: the request is still attempted without
            # a token rather than failing here.
            self.csrf_token = ''
            return ''
        finally:
            if driver:
                driver.quit()

    def _build_params(self, csrf_token, all_movie_yn):
        """Return the form fields for the reservation-rate search POST."""
        return {
            '_CSRFTOKEN': csrf_token,
            'loadEnd': '0',
            'repNationCd': '',
            'areaCd': '0105001:0105002:0105003:0105004:0105005:0105006:0105007:0105008:0105009:0105010:0105011:0105012:0105013:0105014:0105015:0105016:',
            'repNationSelected': '',
            'totIssuAmtRatioOrder': '',
            'totIssuAmtOrder': '',
            'addTotIssuAmtOrder': '',
            'totIssuCntOrder': '',
            'totIssuCntRatioOrder': '',
            'addTotIssuCntOrder': '',
            'dmlMode': 'search',
            'repNationChk': '',
            'repNationKor': 'on',
            'wideareaAll': 'ALL',
            # Kept as a list: requests encodes a list value as one repeated
            # form field per element, so every code is submitted. (Folding the
            # list into a dict under a single key would keep only the last one.)
            'wideareaCd': list(self._WIDE_AREA_CODES),
            'allMovieYn': all_movie_yn,
            'sMultiChk': 'YYY',
            'sNomal': 'Y',
            'sMulti': 'Y',
            'sIndie': 'Y'
        }

    def get_box_office(self, all_movie_yn="N"):
        """Fetch the real-time reservation-rate list.

        Args:
            all_movie_yn (str): Value sent as the ``allMovieYn`` form field:
                "Y" = all movies, "N" = released movies only (default).

        Returns:
            list: One dict per movie with the keys ``rank``, ``title``,
            ``open_date``, ``reservation_rate``, ``reservation_sales``,
            ``total_sales``, ``reservation_audience`` and ``total_audience``.
            An empty list is returned when the request or the parsing fails.
        """
        params = self._build_params(self.get_csrf_token(), all_movie_yn)

        try:
            response = self.session.post(self.url, data=params, timeout=10)
            response.raise_for_status()

            # Prefer JSON; fall back to scraping the HTML table.
            try:
                data = json.loads(response.text)
            except json.JSONDecodeError:
                return self._parse_html(response.text)
            return self._parse_json(data)
        except Exception:
            # Best effort by design: callers get [] instead of an exception.
            return []

    def _parse_json(self, data):
        """Convert a decoded JSON payload into the list of movie dicts.

        Returns ``[]`` when the payload is not a dict or carries no usable
        ``movieList``. Keys missing from an entry default to ``''``.
        """
        if not isinstance(data, dict):
            return []

        return [
            {out_key: movie.get(src_key, '') for out_key, src_key in self._MOVIE_FIELDS}
            for movie in data.get('movieList') or []
        ]

    def _parse_html(self, html):
        """Convert the first matching HTML table into the list of movie dicts.

        The table with class ``tbl_comm`` is preferred, otherwise the first
        table in the document. The first row is skipped as the header, and
        columns missing from a row are filled with ``''``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='tbl_comm') or soup.find('table')
        if not table:
            return []

        keys = [out_key for out_key, _ in self._MOVIE_FIELDS]
        movies = []
        for row in table.find_all('tr')[1:]:  # skip the header row
            cols = [col.get_text(strip=True) for col in row.find_all(['td', 'th'])]
            # Skip short rows and the site's "no data found" placeholder row.
            if len(cols) < 2 or '검색된 데이터가 존재하지 않습니다.' in cols[0]:
                continue
            padded = cols + [''] * (len(keys) - len(cols))
            movies.append(dict(zip(keys, padded)))
        return movies


# Usage example
if __name__ == "__main__":
    client = KobisClient(driver_path=None)  # no explicit driver path

    def _print_rows(rows):
        """Print one formatted line per movie dict."""
        for row in rows:
            print(f"{row['rank']:>2}위: {row['title']:<20} 예매율: {row['reservation_rate']:<8} 예매관객: {row['reservation_audience']}")

    # Released movies only (allMovieYn = "N")
    print("=== 개봉영화 실시간 예매율 ===")
    movies_new = client.get_box_office(all_movie_yn="N")
    _print_rows(movies_new)
    new_count = len(movies_new)
    print(f"\n개봉영화 총 {new_count}편\n")

    # All movies (allMovieYn = "Y")
    print("=== 전체영화 실시간 예매율 ===")
    movies_all = client.get_box_office(all_movie_yn="Y")
    _print_rows(movies_all)
    all_count = len(movies_all)
    print(f"\n전체영화 총 {all_count}편")

    # Difference between the two result sets
    print(f"\n차이: {all_count - new_count}편")
728x90