본문 바로가기
개인 프로젝트/데이터 분석 프로젝트

[캐글] Brazilian EDA 데이터 분석

by 응_비 2025. 7. 23.

https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce?select=olist_products_dataset.csv

Brazilian E-Commerce Public Dataset by Olist

100,000 Orders with product, customer and reviews info

www.kaggle.com

 

PROC SQL OUTOBS=10;
CREATE TABLE Brazilian AS
SELECT DISTINCT A.REVIEW_SCORE
FROM WORK.OLIST_ORDER_REVIEWS_DATASET AS A
ORDER BY 1
;QUIT;

PROC SQL;
CREATE TABLE Brazilian_2 AS
SELECT ORDER_PURCHASE_TIMESTAMP AS YM
		  , COUNT(DISTINCT ORDER_ID) AS ORDER_COUNT
FROM WORK.OLIST_ORDERS_DATASET
GROUP BY 1
ORDER BY 1;
QUIT;

/* DISTINCT 시 데이터값 다름 (N)*/
PROC SQL;
CREATE TABLE Brazilian_3 AS
SELECT COUNT(*) AS CNT
		  , COUNT(ORDER_ID) AS ORDER_COUNT_NOT_DISTINCT
		  , COUNT(DISTINCT ORDER_ID) AS ORDER_COUNT
FROM WORK.OLIST_ORDER_ITEMS_DATASET
;QUIT;

/* 모두 동일(1) */
PROC SQL;
CREATE TABLE Brazilian_3 AS
SELECT COUNT(*) AS CNT
		  , COUNT(ORDER_ID) AS ORDER_COUNT_NOT_DISTINCT
		  , COUNT(DISTINCT ORDER_ID) AS ORDER_COUNT
FROM WORK.OLIST_ORDERS_DATASET
;QUIT;

PROC SQL;
CREATE TABLE Brazilian_3 AS
SELECT ORDER_ID
FROM WORK.OLIST_ORDER_ITEMS_DATASET
WHERE ORDER_ID = '00010242fe8c5a6d1ba2dd792cb16214'
;QUIT;

/* 3. 카테고리별 주문정보 확인 */
PROC SQL;
CREATE TABLE WORK.TMP1 AS
SELECT COUNT(DISTINCT A.ORDER_ID) AS CNT
		  , SUM(A.PRICE) AS PRICE
		  , MEAN(A.FREIGHT_VALUE) AS TOTAL_VALUE
FROM WORK.OLIST_ORDER_ITEMS_DATASET AS A
LEFT JOIN  WORK.OLIST_ORDERS_DATASET AS B
ON A.ORDER_ID = B.ORDER_ID

LEFT JOIN WORK.OLIST_PRODUCTS_DATASET AS C
ON A.PRODUCT_ID = C.PRODUCT_ID

LEFT JOIN WORK.PRODUCT_CATEGORY_NAME_TRANSLATIO AS D
ON C.PRODUCT_CATEGORY_NAME = D.PRODUCT_CATEGORY_NAME

;QUIT;

 

1. 월별 주문 건수 확인 (count)

/* 어떤 카테고리가 가장 많이 팔렸나? */
/* 카테고리별 주문정보 확인  */

/* 1단계: DATETIME 값에서 날짜만 추출 후 YYYY-MM 생성 */
data orders_monthly;
    set WORK.OLIST_ORDERS_DATASET;

    /* DATETIME → DATE 변환 */
    purchase_date = datepart(order_purchase_timestamp);
    format purchase_date date9.;  /* 예: 02OCT2017 */

    /* DATE → YYYYMM 문자 생성 */
    YM = put(intnx('month', purchase_date, 0), yymmn6.);
    format YM $7.;
run;

/* 2단계: 월별 고유 주문 수 집계 */
proc sql;
    create table monthly_order_count as
    select 
        YM,
        count(distinct order_id) as order_count
    from 
        orders_monthly
    group by 
        YM
    order by 
        YM;
quit;

/* 하나의 쿼리 안에 넣기 */
proc sql;
    create table monthly_order_count as
    select 
        put(intnx('month', datepart(order_purchase_timestamp), 0), yymmn6.) as YM format=$7.,
        count(distinct order_id) as order_count
    from 
        WORK.OLIST_ORDERS_DATASET
    group by 
        calculated YM
    order by 
        calculated YM;
quit;

 
2. DISTINCT 활용해서 테이블간 1:N 관계 파악

/* 모두 동일(1) */
PROC SQL;
CREATE TABLE Brazilian_3 AS
SELECT COUNT(*) AS CNT
		  , COUNT(ORDER_ID) AS ORDER_COUNT_NOT_DISTINCT
		  , COUNT(DISTINCT ORDER_ID) AS ORDER_COUNT
FROM WORK.OLIST_ORDERS_DATASET
;QUIT;

 
3. 카테고리별 주문정보 확인
Q (어떤 카테고리가 가장 많이 팔렸나?)

PROC SQL;
CREATE TABLE WORK.TMP1 AS
SELECT DISTINCT D.PRODUCT_CATEGORY_NAME_ENGLISH
		  , COUNT(DISTINCT A.ORDER_ID) AS CNT
		  , SUM(A.PRICE) AS PRICE
		  , MEAN(A.FREIGHT_VALUE) AS TOTAL_VALUE
FROM WORK.OLIST_ORDER_ITEMS_DATASET AS A
LEFT JOIN  WORK.OLIST_ORDERS_DATASET AS B
ON A.ORDER_ID = B.ORDER_ID

LEFT JOIN WORK.OLIST_PRODUCTS_DATASET AS C
ON A.PRODUCT_ID = C.PRODUCT_ID

LEFT JOIN WORK.PRODUCT_CATEGORY_NAME_TRANSLATIO AS D
ON C.PRODUCT_CATEGORY_NAME = D.PRODUCT_CATEGORY_NAME

WHERE 1=1
AND D.PRODUCT_CATEGORY_NAME_ENGLISH IS NOT NULL

GROUP BY 1
ORDER BY 2
;QUIT;

 
4. olist_order_items_dataset 테이블 컬럼인 가격(price)과 화물가치(freight_value)의 합이 총 주문금액인가?

PROC SQL;
CREATE TABLE WORK.TMP2 AS
SELECT A.ORDER_ID
		  , SUM(A.PRICE) AS SUM
		  , SUM(A.FREIGHT_VALUE) AS SUM_VALUE
		  , SUM(A.PRICE) + SUM(A.FREIGHT_VALUE) AS TOTAL_ORDER
FROM WORK.OLIST_ORDER_ITEMS_DATASET AS A
WHERE 1=1
GROUP BY 1
;QUIT;

 
5. 가장 많이 주문한 도시는 어디인가?

/* 5. 가장 많이 주문한 도시는 어디인가?  */
PROC SQL;
CREATE TABLE WORK.TMP3 AS
SELECT B.CUSTOMER_CITY
		  , COUNT(DISTINCT A.ORDER_ID) AS ORDER_COUNT
		  , COUNT(DISTINCT B.CUSTOMER_ID) AS CUSTOMER_CNT
		  , COUNT(DISTINCT B.CUSTOMER_UNIQUE_ID) AS CUSTOMER_UNIQUE_CNT
FROM WORK.OLIST_ORDERS_DATASET AS A
LEFT JOIN WORK.OLIST_CUSTOMERS_DATASET AS B
ON A.CUSTOMER_ID = B.CUSTOMER_ID
GROUP BY B.CUSTOMER_CITY
ORDER BY 2 DESC
;QUIT;

 
6. 누가 가장 많이 샀는가?

/* 주문 수 구간별 고객 수 집계 */
proc sql;
    create table customer_segment_summary as
    select 
        case 
            when order_count >= 5 then '5_Order_Over'
            when order_count = 4 then '4_Order'
            when order_count = 3 then '3_Order'
            when order_count = 2 then '2_Order'
            when order_count = 1 then '1_Order'
        end as order_segment,
        count(distinct customer_unique_id) as customer_cnt
    from 
        WORK.OLIST_CUSTOMERS_DATASET
    group by 
        case 
            when order_count >= 5 then '5_Order_Over'
            when order_count = 4 then '4_Order'
            when order_count = 3 then '3_Order'
            when order_count = 2 then '2_Order'
            when order_count = 1 then '1_Order'
        end
    order by 
        order_segment;
quit;

 
7. 함께 자주 구매하는 제품은 무엇인가?

 

 
* RFM(Recency, Frequency, Monetary) 분석 테이블 : 고객 세분화(segmentation)


- Recency: 최근에 구매했는가?


- Frequency: 자주 구매했는가?


- Monetary: 구매 금액은 얼마인가?

댓글