<aside> 📌 Task : 전처리, 이상치 및 파생 컬럼
</aside>
import pandas as pd
import numpy as np
# 1️⃣ Load the data
df = pd.read_csv('/content/AB_NYC_2019.csv')  # mind the file path!

# 2️⃣ Log-transformed price (log(1 + price))
df['log_price'] = np.log1p(df['price'])

# 3️⃣ Mean and 75th-percentile price per neighbourhood group
price_bounds = (
    df.groupby('neighbourhood_group')['price']
    .describe(percentiles=[0.75])
    .loc[:, ['mean', '75%']]
    .rename(columns={'mean': 'mean_price', '75%': 'upper_bound'})
)
# 4️⃣ Build price_category
def classify_price(row, bounds=None):
    """Assign a coarse price tier to a single listing.

    Args:
        row: Mapping-like record (for example one DataFrame row) providing
            'neighbourhood_group', 'room_type' and 'price'.
        bounds: Optional table indexed by neighbourhood group with
            'mean_price' and 'upper_bound' columns. When omitted, the
            module-level ``price_bounds`` table is used.

    Returns:
        'unknown' if the listing's group is missing from the bounds table;
        'low', 'mid' or 'high' if the listing matches one of the three
        room-type / group rules below; 'other' in every remaining case.
    """
    if bounds is None:
        bounds = price_bounds

    group = row['neighbourhood_group']
    if group not in bounds.index:
        return 'unknown'

    room = row['room_type']
    price = row['price']
    # Fetch both thresholds with a single row lookup.
    mean_price, upper_bound = bounds.loc[group, ['mean_price', 'upper_bound']]

    # Each tier is tied to one specific room type / group combination;
    # anything that matches none of them is reported as 'other'.
    if room == 'Shared room' and group != 'Manhattan' and price < mean_price:
        return 'low'
    if room == 'Private room' and group == 'Brooklyn' and mean_price <= price <= upper_bound:
        return 'mid'
    if room == 'Entire home/apt' and group == 'Manhattan' and price > upper_bound:
        return 'high'
    return 'other'
# Classify row by row: the rule reads the group, room type and price of each listing.
df['price_category'] = df.apply(classify_price, axis=1)
# 5️⃣ Central vs. outskirts
# Latitude/longitude boxes counted as central, each given as
# (lat_min, lat_max, lon_min, lon_max) with inclusive edges.
_CENTRAL_BOXES = (
    (40.70, 40.80, -74.02, -73.95),
    (40.69, 40.70, -73.99, -73.98),
    # NOTE(review): this box lies entirely inside the first one, so it never
    # changes the outcome — confirm the intended coordinates.
    (40.71, 40.73, -73.97, -73.95),
)


def is_central(row):
    """Label a listing as central ('도심') or outskirts ('외곽').

    Args:
        row: Mapping-like record providing 'latitude' and 'longitude'.

    Returns:
        '도심' if the coordinates fall inside any box in ``_CENTRAL_BOXES``
        (edges included), otherwise '외곽'.
    """
    lat, lon = row['latitude'], row['longitude']
    inside_any_box = any(
        lat_min <= lat <= lat_max and lon_min <= lon <= lon_max
        for lat_min, lat_max, lon_min, lon_max in _CENTRAL_BOXES
    )
    return '도심' if inside_any_box else '외곽'
# Row-wise apply because the check needs both latitude and longitude of each listing.
df['is_central'] = df.apply(is_central, axis=1)
# 6️⃣ Stay-length category
def classify_stay(nights):
    """Bucket a minimum-nights value into a stay-length label.

    Args:
        nights: Number of nights (any real number).

    Returns:
        '단기' (short) for values up to 7, '중기' (medium) for values above 7
        and up to 29, '장기' (long) for everything else.
    """
    if nights <= 7:
        return '단기'
    # Contiguous upper bound: a value between 7 and 8 is medium rather than
    # falling through to the long-term bucket.
    if nights <= 29:
        return '중기'
    return '장기'
# Derive the stay-length label from each listing's minimum_nights value.
df['stay_type'] = df['minimum_nights'].apply(classify_stay)
# 7️⃣ Host-scale category
def classify_host(listings):
    """Bucket a host's listing count into a host-scale label.

    Args:
        listings: Number of listings attributed to the host.

    Returns:
        '일반' (regular) for counts up to 2, '반전문' (semi-professional) for
        counts above 2 and up to 10, '전문' (professional) for anything larger.
    """
    if listings <= 2:
        return '일반'
    # Contiguous upper bound: a value between 2 and 3 is semi-professional
    # rather than falling through to the professional bucket.
    if listings <= 10:
        return '반전문'
    return '전문'
# Derive the host-scale label from each listing's calculated_host_listings_count.
df['host_type'] = df['calculated_host_listings_count'].apply(classify_host)
# 8️⃣ Popularity thresholds (computed only from listings with availability_365 > 0)
has_open_days = df['availability_365'] > 0
df_pop = df.loc[has_open_days].copy()

# Per neighbourhood group: 25th percentile of availability and
# 75th percentile of review count.
quantiles = df_pop.groupby('neighbourhood_group').agg(
    avail_25pct=('availability_365', lambda s: np.percentile(s, 25)),
    review_75pct=('number_of_reviews', lambda s: np.percentile(s, 75)),
)
def classify_popularity(row, thresholds=None):
    """Label one listing's popularity relative to its neighbourhood group.

    Args:
        row: Mapping-like record providing 'neighbourhood_group',
            'availability_365' and 'number_of_reviews'.
        thresholds: Optional table indexed by neighbourhood group with
            'avail_25pct' and 'review_75pct' columns. When omitted, the
            module-level ``quantiles`` table is used.

    Returns:
        'unknown' if the group is missing from the thresholds table;
        '인기 숙소' when availability is at or below its threshold and the
        review count is at or above its threshold; '인기 적음' when
        availability is at or above and reviews are at or below their
        thresholds; '기타' otherwise.
    """
    if thresholds is None:
        thresholds = quantiles

    group = row['neighbourhood_group']
    if group not in thresholds.index:
        return 'unknown'

    # Fetch both thresholds with a single row lookup.
    avail_thres, review_thres = thresholds.loc[group, ['avail_25pct', 'review_75pct']]
    availability = row['availability_365']
    reviews = row['number_of_reviews']

    # Both rules include their boundaries, so a listing sitting exactly on
    # both thresholds satisfies each of them; the popular rule is checked
    # first and therefore wins that tie.
    if availability <= avail_thres and reviews >= review_thres:
        return '인기 숙소'
    if availability >= avail_thres and reviews <= review_thres:
        return '인기 적음'
    return '기타'
# Classify every row of df, not only the availability_365 > 0 subset used to
# compute the thresholds.
# NOTE(review): the section heading says only listings with availability > 0
# are targeted, yet rows with availability_365 == 0 are labelled here as well
# — confirm whether they should be excluded.
df['popularity'] = df.apply(classify_popularity, axis=1)
# ✅ Sanity check: preview the source columns next to the derived ones
df.loc[:, [
    'price', 'room_type', 'neighbourhood_group', 'price_category',
    'log_price', 'is_central', 'stay_type', 'host_type', 'popularity',
]].head(5)

<aside> 📌 메인 가설 1 증명
</aside>
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
# 1️⃣ Contingency table: room type (rows) x popularity label (columns)
ct = pd.crosstab(df['room_type'], df['popularity'])
print("📊 교차표:\n", ct)

# 2️⃣ Chi-square test of independence
chi2, p, dof, expected = stats.chi2_contingency(ct)
print("\n🧪 [카이제곱 검정 결과]")
print(f"Chi² 통계량: {chi2:.2f}")
print(f"p-value: {p:.4f}")
print(f"자유도: {dof}")
# Re-attach the labels so the expected counts line up with the observed table.
print("\n기대 빈도표:\n", pd.DataFrame(expected,
                                  index=ct.index,
                                  columns=ct.columns))

# 3️⃣ Row-normalised table: each room type's shares sum to 1
ct_norm = ct.div(ct.sum(axis=1), axis=0)

# 4️⃣ Stacked bar chart of the shares
ax = ct_norm.plot(kind='bar', stacked=True, figsize=(8, 5), colormap='Set2')
plt.title('방 타입별 인기도 분포 (비율)')
plt.ylabel('비율 (%)')
plt.xticks(rotation=0)
plt.legend(title='popularity')
plt.ylim(0, 1.05)  # a little headroom above the full-height bars

# 5️⃣ Percentage labels (one decimal place) centred in each segment
for i, row in enumerate(ct_norm.values):
    cumulative = 0
    for value in row:
        if value > 0.01:  # skip segments too thin to hold a label
            plt.text(i, cumulative + value / 2,
                     f'{value * 100:.1f}%',
                     ha='center', va='center', fontsize=10)
        # Advance past every segment, labelled or not, so the following
        # labels stay centred in their own segments.
        cumulative += value

plt.tight_layout()
plt.show()


| 방 타입 (room_type) | 인기 숙소 (%) | 보통 숙소 (%) | 비인기 숙소 (%) |
|---|---|---|---|
| Entire home/apt | 4.8% | 60.3% | 35.0% |
| Private room | 5.7% | 58.6% | 35.8% |
| Shared room | 4.1% | 43.5% | 52.3% |