This is my first uploaded file.
import torch
torch.cuda.is_available()
True
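Since CUDA is visible to PyTorch, it can help to also print which device was picked up. This is just a quick sanity check using standard `torch.cuda` calls.

```python
import torch

# Quick sanity check: report the CUDA device PyTorch detected.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # e.g. "NVIDIA GeForce RTX ..."
else:
    print("No CUDA device found; PyCaret will run on CPU.")
```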
from pycaret.datasets import get_data
dataset = get_data('juice')
| | Id | Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CH | 237 | 1 | 1.75 | 1.99 | 0.00 | 0.0 | 0 | 0 | 0.500000 | 1.99 | 1.75 | 0.24 | No | 0.000000 | 0.000000 | 0.24 | 1 |
| 1 | 2 | CH | 239 | 1 | 1.75 | 1.99 | 0.00 | 0.3 | 0 | 1 | 0.600000 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 |
| 2 | 3 | CH | 245 | 1 | 1.86 | 2.09 | 0.17 | 0.0 | 0 | 0 | 0.680000 | 2.09 | 1.69 | 0.40 | No | 0.000000 | 0.091398 | 0.23 | 1 |
| 3 | 4 | MM | 227 | 1 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.400000 | 1.69 | 1.69 | 0.00 | No | 0.000000 | 0.000000 | 0.00 | 1 |
| 4 | 5 | CH | 228 | 7 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.956535 | 1.69 | 1.69 | 0.00 | Yes | 0.000000 | 0.000000 | 0.00 | 0 |
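The last 100 rows of this dataset are scored as unseen data at the end of the notebook, so it can be convenient to split that hold-out off explicitly up front. A minimal sketch (the names `data_seen` and `data_unseen` are my own):

```python
# Sketch: reserve the last 100 rows as an unseen hold-out, mirroring the
# predict_model(final_model, data=dataset.iloc[-100:]) call at the end.
data_unseen = dataset.iloc[-100:].copy()
data_seen = dataset.iloc[:-100].copy()
print(data_seen.shape, data_unseen.shape)   # (970, 19) (100, 19)
```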
from pycaret.classification import *
setup_clf = setup(data=dataset, target='Purchase')
| | Description | Value |
|---|---|---|
| 0 | session_id | 288 |
| 1 | Target | Purchase |
| 2 | Target Type | Binary |
| 3 | Label Encoded | CH: 0, MM: 1 |
| 4 | Original Data | (1070, 19) |
| 5 | Missing Values | False |
| 6 | Numeric Features | 13 |
| 7 | Categorical Features | 5 |
| 8 | Ordinal Features | False |
| 9 | High Cardinality Features | False |
| 10 | High Cardinality Method | None |
| 11 | Transformed Train Set | (748, 17) |
| 12 | Transformed Test Set | (322, 17) |
| 13 | Shuffle Train-Test | True |
| 14 | Stratify Train-Test | False |
| 15 | Fold Generator | StratifiedKFold |
| 16 | Fold Number | 10 |
| 17 | CPU Jobs | -1 |
| 18 | Use GPU | False |
| 19 | Log Experiment | False |
| 20 | Experiment Name | clf-default-name |
| 21 | USI | ad07 |
| 22 | Imputation Type | simple |
| 23 | Iterative Imputation Iteration | None |
| 24 | Numeric Imputer | mean |
| 25 | Iterative Imputation Numeric Model | None |
| 26 | Categorical Imputer | constant |
| 27 | Iterative Imputation Categorical Model | None |
| 28 | Unknown Categoricals Handling | least_frequent |
| 29 | Normalize | False |
| 30 | Normalize Method | None |
| 31 | Transformation | False |
| 32 | Transformation Method | None |
| 33 | PCA | False |
| 34 | PCA Method | None |
| 35 | PCA Components | None |
| 36 | Ignore Low Variance | False |
| 37 | Combine Rare Levels | False |
| 38 | Rare Level Threshold | None |
| 39 | Numeric Binning | False |
| 40 | Remove Outliers | False |
| 41 | Outliers Threshold | None |
| 42 | Remove Multicollinearity | False |
| 43 | Multicollinearity Threshold | None |
| 44 | Remove Perfect Collinearity | True |
| 45 | Clustering | False |
| 46 | Clustering Iteration | None |
| 47 | Polynomial Features | False |
| 48 | Polynomial Degree | None |
| 49 | Trignometry Features | False |
| 50 | Polynomial Threshold | None |
| 51 | Group Features | False |
| 52 | Feature Selection | False |
| 53 | Feature Selection Method | classic |
| 54 | Features Selection Threshold | None |
| 55 | Feature Interaction | False |
| 56 | Feature Ratio | False |
| 57 | Interaction Threshold | None |
| 58 | Fix Imbalance | False |
| 59 | Fix Imbalance Method | SMOTE |
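`setup()` was called with defaults here, which is why most of the preprocessing flags above read False or None. If you want a fixed seed or basic preprocessing, those are passed directly to `setup()`; the sketch below uses a few common PyCaret 2.x arguments (`session_id`, `train_size`, `normalize`, `silent`).

```python
# Sketch: a more explicit setup() call; all arguments shown are standard
# PyCaret 2.x setup() parameters.
setup_clf = setup(
    data=dataset,
    target='Purchase',
    session_id=288,     # pin the seed so folds and results are reproducible
    train_size=0.7,     # the default 70/30 split shown as 748/322 above
    normalize=True,     # z-score the numeric features
    silent=True,        # skip the interactive dtype confirmation prompt
)
```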
rf = create_model('rf', fold=5)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8000 | 0.8649 | 0.7018 | 0.7547 | 0.7273 | 0.5697 | 0.5706 |
| 1 | 0.7933 | 0.8453 | 0.7895 | 0.7031 | 0.7438 | 0.5716 | 0.5743 |
| 2 | 0.8000 | 0.8817 | 0.7018 | 0.7547 | 0.7273 | 0.5697 | 0.5706 |
| 3 | 0.7248 | 0.8169 | 0.6429 | 0.6316 | 0.6372 | 0.4156 | 0.4156 |
| 4 | 0.8054 | 0.8380 | 0.7679 | 0.7288 | 0.7478 | 0.5895 | 0.5901 |
| Mean | 0.7847 | 0.8494 | 0.7207 | 0.7146 | 0.7167 | 0.5432 | 0.5443 |
| Std | 0.0302 | 0.0223 | 0.0524 | 0.0457 | 0.0406 | 0.0643 | 0.0647 |
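`create_model()` takes any estimator ID from PyCaret's model table; `models()` lists them. A small sketch using both functions:

```python
# Sketch: list the available estimator IDs, then train another model with the
# same 5-fold cross-validation as the 'rf' call above.
all_models = models()                  # DataFrame indexed by IDs like 'lr', 'rf', 'lightgbm'
print(all_models.index.tolist())

lightgbm = create_model('lightgbm', fold=5)
```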
top5 = compare_models(sort='Accuracy', n_select=5)
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
| lr | Logistic Regression | 0.8155 | 0.8868 | 0.7277 | 0.7756 | 0.7491 | 0.6036 | 0.6061 | 0.1880 |
| ridge | Ridge Classifier | 0.8155 | 0.0000 | 0.7523 | 0.7627 | 0.7557 | 0.6077 | 0.6095 | 0.0070 |
| lda | Linear Discriminant Analysis | 0.8101 | 0.8876 | 0.7346 | 0.7596 | 0.7448 | 0.5940 | 0.5963 | 0.0060 |
| gbc | Gradient Boosting Classifier | 0.8007 | 0.8756 | 0.7203 | 0.7459 | 0.7301 | 0.5725 | 0.5755 | 0.0230 |
| ada | Ada Boost Classifier | 0.7968 | 0.8640 | 0.7060 | 0.7462 | 0.7228 | 0.5629 | 0.5661 | 0.0210 |
| rf | Random Forest Classifier | 0.7888 | 0.8553 | 0.7346 | 0.7189 | 0.7218 | 0.5523 | 0.5571 | 0.1100 |
| lightgbm | Light Gradient Boosting Machine | 0.7860 | 0.8620 | 0.7314 | 0.7149 | 0.7199 | 0.5473 | 0.5506 | 0.0160 |
| et | Extra Trees Classifier | 0.7821 | 0.8178 | 0.7347 | 0.7053 | 0.7147 | 0.5391 | 0.5447 | 0.1090 |
| dt | Decision Tree Classifier | 0.7540 | 0.7529 | 0.7031 | 0.6696 | 0.6821 | 0.4822 | 0.4863 | 0.0050 |
| nb | Naive Bayes | 0.7499 | 0.8248 | 0.7527 | 0.6475 | 0.6946 | 0.4854 | 0.4913 | 0.0060 |
| knn | K Neighbors Classifier | 0.7314 | 0.7596 | 0.5930 | 0.6646 | 0.6242 | 0.4166 | 0.4202 | 0.0280 |
| dummy | Dummy Classifier | 0.6217 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0050 |
| svm | SVM - Linear Kernel | 0.5240 | 0.0000 | 0.4000 | 0.1512 | 0.2194 | 0.0000 | 0.0000 | 0.0050 |
| qda | Quadratic Discriminant Analysis | 0.3717 | 0.4771 | 0.9034 | 0.3591 | 0.5110 | -0.0424 | -0.0494 | 0.0060 |
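`compare_models()` can also be limited to a subset of estimators or ranked by a different metric; `include`, `sort`, and `n_select` are standard arguments.

```python
# Sketch: compare only the tree ensembles and rank them by AUC instead of Accuracy.
best_tree = compare_models(
    include=['rf', 'et', 'gbc', 'lightgbm'],   # restrict the candidate list
    sort='AUC',
    n_select=1,                                # return only the top model
)
```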
top5
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=1000,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=288, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False),
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
max_iter=None, normalize=False, random_state=288, solver='auto',
tol=0.001),
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
solver='svd', store_covariance=False, tol=0.0001),
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='deprecated',
random_state=288, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False),
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
n_estimators=50, random_state=288)]
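Each element of `top5` is a fitted scikit-learn estimator, so individual models can be inspected with `plot_model`; `'auc'` and `'confusion_matrix'` are standard plot types.

```python
# Sketch: visualize the best model from the comparison (plots render inline in a notebook).
plot_model(top5[0], plot='auc')
plot_model(top5[0], plot='confusion_matrix')
```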
tuned_top5 = [tune_model(i) for i in top5]
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8267 | 0.9003 | 0.7586 | 0.7857 | 0.7719 | 0.6322 | 0.6325 |
| 1 | 0.8667 | 0.9067 | 0.7241 | 0.9130 | 0.8077 | 0.7077 | 0.7189 |
| 2 | 0.8533 | 0.9153 | 0.8276 | 0.8000 | 0.8136 | 0.6927 | 0.6930 |
| 3 | 0.7867 | 0.8894 | 0.7500 | 0.7000 | 0.7241 | 0.5506 | 0.5514 |
| 4 | 0.8667 | 0.9362 | 0.8929 | 0.7812 | 0.8333 | 0.7230 | 0.7275 |
| 5 | 0.8400 | 0.9236 | 0.7500 | 0.8077 | 0.7778 | 0.6530 | 0.6541 |
| 6 | 0.7467 | 0.8469 | 0.5714 | 0.6957 | 0.6275 | 0.4383 | 0.4432 |
| 7 | 0.7600 | 0.8100 | 0.6429 | 0.6923 | 0.6667 | 0.4796 | 0.4804 |
| 8 | 0.8649 | 0.8769 | 0.7500 | 0.8750 | 0.8077 | 0.7045 | 0.7094 |
| 9 | 0.7568 | 0.8688 | 0.7143 | 0.6667 | 0.6897 | 0.4900 | 0.4908 |
| Mean | 0.8168 | 0.8874 | 0.7382 | 0.7717 | 0.7520 | 0.6072 | 0.6101 |
| Std | 0.0468 | 0.0362 | 0.0839 | 0.0783 | 0.0672 | 0.1024 | 0.1035 |
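`tune_model()` was used here with its defaults (10-fold CV and a random grid search scored on Accuracy). `n_iter` and `optimize` are standard arguments if you want a wider search or a different target metric; a sketch:

```python
# Sketch: tune each selected model with a larger random search, optimizing AUC
# instead of Accuracy. n_iter and optimize are standard tune_model arguments.
tuned_top5 = [tune_model(m, n_iter=50, optimize='AUC') for m in top5]
```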
blender_top5 = blend_models(estimator_list=tuned_top5)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8667 | 0.0000 | 0.8621 | 0.8065 | 0.8333 | 0.7224 | 0.7235 |
| 1 | 0.8267 | 0.0000 | 0.7586 | 0.7857 | 0.7719 | 0.6322 | 0.6325 |
| 2 | 0.8667 | 0.0000 | 0.8621 | 0.8065 | 0.8333 | 0.7224 | 0.7235 |
| 3 | 0.7867 | 0.0000 | 0.7857 | 0.6875 | 0.7333 | 0.5569 | 0.5603 |
| 4 | 0.8800 | 0.0000 | 0.8929 | 0.8065 | 0.8475 | 0.7490 | 0.7516 |
| 5 | 0.8533 | 0.0000 | 0.7857 | 0.8148 | 0.8000 | 0.6843 | 0.6846 |
| 6 | 0.8000 | 0.0000 | 0.7143 | 0.7407 | 0.7273 | 0.5695 | 0.5697 |
| 7 | 0.7333 | 0.0000 | 0.6786 | 0.6333 | 0.6552 | 0.4382 | 0.4389 |
| 8 | 0.8514 | 0.0000 | 0.7143 | 0.8696 | 0.7843 | 0.6726 | 0.6801 |
| 9 | 0.7568 | 0.0000 | 0.7143 | 0.6667 | 0.6897 | 0.4900 | 0.4908 |
| Mean | 0.8221 | 0.0000 | 0.7768 | 0.7618 | 0.7676 | 0.6237 | 0.6256 |
| Std | 0.0480 | 0.0000 | 0.0706 | 0.0725 | 0.0615 | 0.1005 | 0.1010 |
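`blend_models()` wraps the tuned models in a VotingClassifier; because RidgeClassifier has no `predict_proba`, the ensemble falls back to hard voting, which is consistent with the all-zero AUC column above. If a trainable combiner is preferred, `stack_models` is the PyCaret counterpart; the logistic-regression meta-learner below is just an illustrative choice.

```python
# Sketch: stack the tuned models instead of voting. stack_models and meta_model
# are standard PyCaret 2.x API; the 'lr' meta-learner is an illustrative choice.
stacker_top5 = stack_models(estimator_list=tuned_top5, meta_model=create_model('lr'))
```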
final_model = finalize_model(blender_top5)
prediction = predict_model(final_model, data=dataset.iloc[-100:])
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|---|
| 0 | Voting Classifier | 0 | 0.8000 | 0 | 0 | 0 | 0 | 0 |
prediction
| | Id | Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE | Label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 970 | 971 | MM | 240 | 1 | 1.75 | 1.99 | 0.0 | 0.30 | 0 | 1 | 0.224526 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 | MM |
| 971 | 972 | MM | 241 | 1 | 1.86 | 1.99 | 0.0 | 0.30 | 0 | 1 | 0.179621 | 1.69 | 1.86 | -0.17 | No | 0.150754 | 0.000000 | 0.13 | 1 | MM |
| 972 | 973 | CH | 242 | 1 | 1.86 | 1.99 | 0.0 | 0.30 | 0 | 1 | 0.143697 | 1.69 | 1.86 | -0.17 | No | 0.150754 | 0.000000 | 0.13 | 1 | MM |
| 973 | 974 | MM | 243 | 1 | 1.86 | 1.99 | 0.0 | 0.80 | 0 | 1 | 0.314957 | 1.19 | 1.86 | -0.67 | No | 0.402010 | 0.000000 | 0.13 | 1 | MM |
| 974 | 975 | MM | 244 | 1 | 1.86 | 2.09 | 0.0 | 0.00 | 0 | 0 | 0.251966 | 2.09 | 1.86 | 0.23 | No | 0.000000 | 0.000000 | 0.23 | 1 | MM |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1065 | 1066 | CH | 252 | 7 | 1.86 | 2.09 | 0.1 | 0.00 | 0 | 0 | 0.587822 | 2.09 | 1.76 | 0.33 | Yes | 0.000000 | 0.053763 | 0.23 | 0 | CH |
| 1066 | 1067 | CH | 256 | 7 | 1.86 | 2.18 | 0.0 | 0.00 | 0 | 0 | 0.670258 | 2.18 | 1.86 | 0.32 | Yes | 0.000000 | 0.000000 | 0.32 | 0 | CH |
| 1067 | 1068 | MM | 257 | 7 | 1.86 | 2.18 | 0.0 | 0.00 | 0 | 0 | 0.736206 | 2.18 | 1.86 | 0.32 | Yes | 0.000000 | 0.000000 | 0.32 | 0 | CH |
| 1068 | 1069 | CH | 261 | 7 | 1.86 | 2.13 | 0.0 | 0.24 | 0 | 0 | 0.588965 | 1.89 | 1.86 | 0.03 | Yes | 0.112676 | 0.000000 | 0.27 | 0 | CH |
| 1069 | 1070 | CH | 270 | 1 | 1.86 | 2.18 | 0.0 | 0.00 | 0 | 0 | 0.671172 | 2.18 | 1.86 | 0.32 | No | 0.000000 | 0.000000 | 0.32 | 1 | CH |
100 rows × 20 columns
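After `finalize_model()`, the pipeline is usually persisted so it can be reused outside the notebook; `save_model` and `load_model` are PyCaret's standard helpers (the file name `juice_blender` is my own choice).

```python
# Sketch: persist the finalized pipeline and reload it later for scoring.
save_model(final_model, 'juice_blender')            # writes juice_blender.pkl
loaded = load_model('juice_blender')
new_predictions = predict_model(loaded, data=dataset.iloc[-100:])
```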
from pycaret.utils import check_metric
check_metric(prediction['Purchase'], prediction['Label'], metric='Accuracy')
0.81
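`check_metric()` supports other metrics as well, but with the string class labels used here it can be simpler to compute them directly with scikit-learn. In the sketch below, `pos_label='MM'` is an illustrative choice that matches the MM = 1 encoding reported by `setup()`.

```python
# Sketch: compute additional metrics on the same predictions with scikit-learn.
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = prediction['Purchase']
y_pred = prediction['Label']

# pos_label='MM' treats MM as the positive class, matching the MM=1 label encoding.
print('Recall   :', recall_score(y_true, y_pred, pos_label='MM'))
print('Precision:', precision_score(y_true, y_pred, pos_label='MM'))
print('F1       :', f1_score(y_true, y_pred, pos_label='MM'))
```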