<aside> 📌 Task : 전처리, 이상치 및 파생 컬럼

</aside>

[ ] 전처리, 이상치 및 파생 컬럼

import pandas as pd
import numpy as np

# 1️⃣ Load the listings data
df = pd.read_csv('/content/AB_NYC_2019.csv')  # mind the file path: it is hard-coded

# 2️⃣ Log-transformed price (log1p, so a price of 0 maps to 0 instead of -inf)
df['log_price'] = np.log1p(df['price'])

# 3️⃣ Per-neighbourhood_group mean price and 75th-percentile price,
#    indexed by neighbourhood_group with columns mean_price / upper_bound
price_bounds = df.groupby('neighbourhood_group')['price'].agg(
    mean_price='mean',
    upper_bound=lambda prices: prices.quantile(0.75),
)

# 4️⃣ Build the price_category column
def classify_price(row):
    """Assign a price-category label to a single listing row.

    Reads ``neighbourhood_group``, ``room_type`` and ``price`` from *row* and
    compares the price against that group's entry in the module-level
    ``price_bounds`` table (columns ``mean_price`` and ``upper_bound``).

    Returns:
        'unknown' when the group has no entry in ``price_bounds``;
        'low'  for a shared room outside Manhattan priced below the group mean;
        'mid'  for a Brooklyn private room priced between mean and upper bound
               (both inclusive);
        'high' for a Manhattan entire home/apt priced above the upper bound;
        'other' for every remaining combination.
    """
    borough = row['neighbourhood_group']
    room_kind = row['room_type']
    amount = row['price']

    if borough not in price_bounds.index:
        return 'unknown'

    # One row lookup yields both thresholds for this group.
    bounds = price_bounds.loc[borough]
    avg = bounds['mean_price']
    ceiling = bounds['upper_bound']

    if room_kind == 'Shared room' and borough != 'Manhattan' and amount < avg:
        return 'low'
    if room_kind == 'Private room' and borough == 'Brooklyn' and avg <= amount <= ceiling:
        return 'mid'
    if room_kind == 'Entire home/apt' and borough == 'Manhattan' and amount > ceiling:
        return 'high'
    return 'other'


# Label every listing, one row at a time.
df['price_category'] = df.apply(classify_price, axis=1)

# 5️⃣ Central vs. outskirts
def is_central(row):
    """Return '도심' if the row's coordinates fall in a central box, else '외곽'.

    *row* must support ``row['latitude']`` and ``row['longitude']``. A point is
    central when it lies inside at least one of the hard-coded rectangles
    below; every bound is inclusive.
    """
    lat, lon = row['latitude'], row['longitude']

    # (lat_min, lat_max, lon_min, lon_max)
    central_boxes = (
        (40.70, 40.80, -74.02, -73.95),
        (40.69, 40.70, -73.99, -73.98),
        (40.71, 40.73, -73.97, -73.95),  # lies entirely within the first box
    )

    inside_any = any(
        lat_lo <= lat <= lat_hi and lon_lo <= lon <= lon_hi
        for lat_lo, lat_hi, lon_lo, lon_hi in central_boxes
    )
    return '도심' if inside_any else '외곽'

# Evaluate is_central on every row (axis=1) and store the resulting label.
df['is_central'] = df.apply(is_central, axis=1)

# 6️⃣ Stay-length classification
def classify_stay(nights):
    """Bucket a minimum-nights value into a stay-length label.

    Args:
        nights: Minimum number of nights; any real number is accepted.

    Returns:
        '단기' (short) when ``nights <= 7``,
        '중기' (medium) when ``7 < nights <= 29``,
        '장기' (long) otherwise.

    Note:
        NaN fails every comparison and therefore ends up in '장기'.
    """
    if nights <= 7:
        return '단기'
    # No explicit lower bound here: the previous `8 <= nights` check left a
    # gap, so a non-integer value such as 7.5 skipped '중기' and was labelled
    # '장기'.
    if nights <= 29:
        return '중기'
    return '장기'

# Label each listing element-wise from its minimum_nights value.
df['stay_type'] = df['minimum_nights'].map(classify_stay)

# 7️⃣ Host-scale classification
def classify_host(listings):
    """Bucket a host's listing count into a host-type label.

    Args:
        listings: Number of listings the host has; any real number is accepted.

    Returns:
        '일반' (casual) when ``listings <= 2``,
        '반전문' (semi-professional) when ``2 < listings <= 10``,
        '전문' (professional) otherwise.

    Note:
        NaN fails every comparison and therefore ends up in '전문'.
    """
    if listings <= 2:
        return '일반'
    # No explicit lower bound here: the previous `3 <= listings` check left a
    # gap, so a non-integer value such as 2.5 skipped '반전문' and was
    # labelled '전문'.
    if listings <= 10:
        return '반전문'
    return '전문'

# Label each listing element-wise from its host's listing count.
df['host_type'] = df['calculated_host_listings_count'].map(classify_host)

# 8️⃣ Popularity classification (thresholds use only listings with availability > 0)
df_pop = df.loc[df['availability_365'] > 0].copy()

# Per-neighbourhood_group thresholds: the 25th percentile of availability_365
# and the 75th percentile of number_of_reviews.
quantiles = df_pop.groupby('neighbourhood_group').agg(
    avail_25pct=('availability_365', lambda days: np.percentile(days, 25)),
    review_75pct=('number_of_reviews', lambda counts: np.percentile(counts, 75)),
)

def classify_popularity(row):
    """Assign a popularity label to a single listing row.

    Looks up the row's ``neighbourhood_group`` in the module-level
    ``quantiles`` table (columns ``avail_25pct`` and ``review_75pct``) and
    compares the row's ``availability_365`` and ``number_of_reviews``
    against those thresholds.

    Returns:
        'unknown'   when the group has no entry in ``quantiles``;
        '인기 숙소' when availability <= threshold and reviews >= threshold;
        '인기 적음' when availability >= threshold and reviews <= threshold;
        '기타'      otherwise.

    The first matching branch wins, so a row sitting exactly on both
    thresholds is labelled '인기 숙소'.
    """
    borough = row['neighbourhood_group']
    if borough not in quantiles.index:
        return 'unknown'

    limits = quantiles.loc[borough]
    avail_cut = limits['avail_25pct']
    review_cut = limits['review_75pct']

    open_days = row['availability_365']
    review_count = row['number_of_reviews']

    if open_days <= avail_cut and review_count >= review_cut:
        return '인기 숙소'
    if open_days >= avail_cut and review_count <= review_cut:
        return '인기 적음'
    return '기타'


# NOTE(review): this labels every row of df, while the thresholds are built
# only from rows with availability_365 > 0 — confirm whether rows with zero
# availability are meant to receive a label.
df['popularity'] = df.apply(classify_popularity, axis=1)

# ✅ Sanity check: preview the source columns next to the derived ones
preview_cols = [
    'price', 'room_type', 'neighbourhood_group', 'price_category',
    'log_price', 'is_central', 'stay_type', 'host_type', 'popularity',
]
df[preview_cols].head()

<aside> 📌 메인 가설 1 증명

</aside>

방 타입에 따라 숙소의 인기도 차이가 있을 것이다.

[ ] 메인가설 증명

import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt

# 1️⃣ Build the contingency table (rows: room_type, columns: popularity)
ct = pd.crosstab(df['room_type'], df['popularity'])
# A real newline escape: the doubled backslash printed a literal "\n".
print("📊 교차표:\n", ct)

# 2️⃣ Chi-square test of independence on the contingency table
chi2, p, dof, expected = stats.chi2_contingency(ct)

print("\n🧪 [카이제곱 검정 결과]")
print(f"Chi² 통계량: {chi2:.2f}")
print(f"p-value: {p:.4f}")
print(f"자유도: {dof}")
# Re-wrap the expected counts with ct's labels so the table is readable.
print("\n기대 빈도표:\n", pd.DataFrame(expected,
                                    index=ct.index,
                                    columns=ct.columns))

# 3️⃣ Row-normalised contingency table (each room_type row sums to 1)
ct_norm = ct.div(ct.sum(axis=1), axis=0)

# 4️⃣ Draw the stacked bar chart.
# The y-axis is labelled in percent, so plot percent values (0-100) rather
# than 0-1 fractions; previously the label said "(%)" while the ticks showed
# fractions.
ct_pct = ct_norm * 100
ax = ct_pct.plot(kind='bar', stacked=True, figsize=(8, 5), colormap='Set2')
plt.title('방 타입별 인기도 분포 (비율)')
plt.ylabel('비율 (%)')
plt.xticks(rotation=0)
plt.legend(title='popularity')
plt.ylim(0, 105)  # small headroom above 100 %

# 5️⃣ Percentage labels (one decimal place) centred in each bar segment
for bar_idx, fractions in enumerate(ct_norm.values):
    bottom = 0.0  # running height of the stack, in percent units
    for fraction in fractions:
        pct = fraction * 100
        if fraction > 0.01:  # skip segments too thin to hold a label
            ax.text(bar_idx, bottom + pct / 2,
                    f'{pct:.1f}%',
                    ha='center', va='center', fontsize=10)
        bottom += pct

plt.tight_layout()
plt.show()

✅ 그래프 해석 요약

| 방 타입 (`room_type`) | 인기 숙소 (%) | 보통 숙소 (%) | 비인기 숙소 (%) |
| --- | --- | --- | --- |
| Entire home/apt | 4.8% | 60.3% | 35.0% |
| Private room | 5.7% | 58.6% | 35.8% |
| Shared room | 4.1% | 43.5% | 52.3% |

🧠 인사이트 요약

통계적으로 유의미한 차이는 있지만, 실제 수치상의 차이는 근소하다.