- numpy - pandas - matplotlib - scikit-learn

Predicting whether a visitor will buy a car using AI in PyScript 🐍

（ここでは来店客の年齢・年収から車を購入するかどうかを予測します）

Choose Test Split: (分割するテスト用データの割合) Enter a value between 0 and 1.

Choose Model: (モデルを選択)

Support Vector Machine

Random Forest

Decision Tree

Naive Bayes

Logistic Regression

Buyer's Parameters (見込客の情報)

Age: (年齢) Enter the buyer's age. Annual Salary (US $): (年収: 米ドル) Enter the buyer's annual salary.

# import python libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyodide.http import open_url
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC  # SVC: C-Support Vector Classification
from sklearn.tree import DecisionTreeClassifier

warnings.simplefilter('ignore')

# NOTE: `document`, `console` and `pyscript` are not imported here; this
# script relies on the hosting PyScript page providing them as globals.

# constants
CSV_FILE = '../data/article078/CarSalesData.csv'
EXPECTED_ROWS = 400  # row count used to decide that the CSV loaded correctly
BUTTON_IDS = ("visualize", "train", "plot", "score", "predict")

# Model code (value of the "modelSelection" radio button) ->
# (display name, factory that builds a fresh, unfitted estimator).
MODEL_FACTORIES = {
    'SVM': ('Support Vector Classification',
            lambda: SVC(gamma='auto')),
    'RF': ('Random Forest Classifier',
           lambda: RandomForestClassifier(random_state=1, n_estimators=100)),
    'DT': ('Decision Tree Classifier',
           lambda: DecisionTreeClassifier()),
    'NB': ('Gaussian Naive Bayes',
           lambda: GaussianNB()),
    'LR': ('Logistic Regression',
           lambda: LogisticRegression(random_state=0)),
}

# define global variables
sales_data = pd.DataFrame()
X_train = 0; X_test = 0; y_train = 0; y_test = 0
sc = StandardScaler()
model = SVC()
model_name = ''
model_trained = False  # set to True once train_model() has fitted sc and model


def _require_trained(caller):
    """Return True if a model has been trained; otherwise report and return False.

    The plot/score/predict buttons are enabled as soon as the data is
    imported, so they can be clicked before ``train_model`` has run.
    """
    if model_trained:
        return True
    console.error(f'{caller}(): model is not trained yet')
    pyscript.write('output', f'{caller}(): train the model first')
    return False


def import_data(*args, **kwargs):
    """Load the car sales CSV into ``sales_data`` and enable the UI buttons.

    The load is accepted only when the frame has ``EXPECTED_ROWS`` rows.
    Any failure (exception while fetching/parsing, or an unexpected row
    count) is reported to the console and the ``output`` element, and the
    buttons are left untouched.
    """
    global sales_data
    try:
        df = pd.read_csv(open_url(CSV_FILE))
    except Exception as err:  # UI boundary: report instead of crashing the handler
        console.error(f'pd.read_csv() error: {err}')
        pyscript.write('output', f'import_data(): pd.read_csv() network error {err}')
        return

    if df.shape[0] != EXPECTED_ROWS:
        console.error(f'pd.read_csv() error: df.shape: {df.shape}')
        pyscript.write('output', f'import_data(): pd.read_csv() network error df.shape {df.shape}')
        return

    # Publish the data before enabling the buttons that depend on it.
    sales_data = df
    for button_id in BUTTON_IDS:
        button = document.getElementById(button_id)
        button.classList.remove("disabled")
        button.disabled = False
    console.warn(f'df.shape: {df.shape}')
    pyscript.write('output', f'import_data(): df.shape {df.shape}')


def visualize_data(*args, **kwargs):
    """Scatter-plot the imported data (column 2 vs column 3), coloured by column 4.

    Per the original comments, columns 2 and 3 are Age and EstimatedSalary
    and column 4 is the Purchased label (0 or 1).
    """
    df = sales_data
    if df.empty:
        console.error('visualize_data(): no data imported yet')
        pyscript.write('output', 'visualize_data(): import the data first')
        return

    X = df.iloc[:, [2, 3]].values  # Age[0], EstimatedSalary[1]
    y = df.iloc[:, 4].values       # Purchased (0 or 1)
    not_purchased = X[y == 0]
    purchased = X[y == 1]

    plt.style.use('dark_background')
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.scatter(not_purchased[:, 0], not_purchased[:, 1], color='r', label='0:Not Purchased')
    ax.scatter(purchased[:, 0], purchased[:, 1], color='g', label='1:Purchased')
    ax.legend()
    ax.set(xlabel='Age', ylabel='Estimated Salary ($)', title='Plot input data\n(Age vs Salary)')
    pyscript.write('output', fig)
    # Release the figure; otherwise every click keeps another one alive.
    # NOTE(review): assumes pyscript.write renders the figure synchronously.
    plt.close(fig)


def train_model(*args, **kwargs):
    """Build the selected classifier, split/scale the data and fit the model.

    Reads the checked ``modelSelection`` radio button and the ``test_split``
    input from the page. On success the module-level ``sc``, ``model``,
    ``model_name`` and train/test arrays are replaced together; on any
    validation or fitting error a message is written to ``output`` and the
    previous state is left untouched.
    """
    global sc, model, model_name, model_trained
    global X_train, X_test, y_train, y_test

    selected = document.querySelector('input[name="modelSelection"]:checked')
    model_type = selected.value if selected is not None else None  # SVM, RF, DT, NB, LR
    if model_type not in MODEL_FACTORIES:
        console.error(f'train_model(): unknown model type {model_type!r}')
        pyscript.write('output', f'train_model(): select a model first')
        return
    new_name, build_model = MODEL_FACTORIES[model_type]
    new_model = build_model()
    console.warn(f'model= {model_type} - {new_name}')

    # Prepare the data
    raw_split = document.getElementById("test_split").value
    try:
        test_split = float(raw_split)  # e.g. 0.25
    except (TypeError, ValueError):
        test_split = float('nan')
    # train_test_split requires a float test_size strictly inside (0, 1);
    # the chained comparison is also False for NaN.
    if not 0.0 < test_split < 1.0:
        console.error(f'train_model(): invalid test_split {raw_split!r}')
        pyscript.write('output', 'train_model(): test split must be a number between 0 and 1')
        return
    console.warn(f'test_split= {test_split:.2f}')

    df = sales_data
    X = df.iloc[:, [2, 3]].values  # Age[0], EstimatedSalary[1]
    y = df.iloc[:, 4].values       # Purchased (0 or 1)

    try:
        # Split the input data using the requested test fraction
        new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
            X, y, test_size=test_split, random_state=0)
        new_sc = StandardScaler()
        new_X_train = new_sc.fit_transform(new_X_train)
        new_X_test = new_sc.transform(new_X_test)
        # Train the model
        new_model.fit(new_X_train, new_y_train)
    except ValueError as err:
        # e.g. a split so extreme that one side ends up empty
        console.error(f'train_model(): {err}')
        pyscript.write('output', f'train_model(): failed - {err}')
        return

    # Publish everything at once so scaler, model and data always match.
    sc, model, model_name = new_sc, new_model, new_name
    X_train, X_test = new_X_train, new_X_test
    y_train, y_test = new_y_train, new_y_test
    model_trained = True
    console.warn('model.fit(x, y) done!')
    pyscript.write('output', f'train_model(): done!')


def plot_boundary(*args, **kwargs):
    """Plot the decision boundary of the trained model over the training data.

    ``X_train`` has already been transformed by ``sc``, so both axes are in
    standardised units, not raw age / salary.
    """
    if not _require_trained('plot_boundary'):
        return

    plt.style.use('dark_background')
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    X = X_train
    y = y_train
    # Set min and max values and give it some padding (scaled units)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5  # scaled Age
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5  # scaled Salary
    h = 0.1
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the whole grid
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    ax.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)  # X=Age, Y=Salary (scaled)
    ax.set(title=f'{model_name}\n(Decision Boundary)')
    pyscript.write('output', fig)
    # Release the figure; otherwise every click keeps another one alive.
    # NOTE(review): assumes pyscript.write renders the figure synchronously.
    plt.close(fig)


def accuracy(*args, **kwargs):
    """Report the trained model's accuracy (in percent) on the held-out test set."""
    if not _require_trained('accuracy'):
        return

    # Evaluate the prediction results
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred) * 100
    console.warn(f'score: {score}')
    pyscript.write('output', f'accuracy(): score= {score}')


def predict(*args, **kwargs):
    """Predict whether a buyer with the entered age and salary will purchase.

    Reads the ``age`` and ``salary`` inputs from the page as integers,
    scales them with the fitted ``sc`` and writes the model's verdict to
    ``output``. Invalid input is reported instead of raising.
    """
    if not _require_trained('predict'):
        return

    raw_age = document.getElementById("age").value
    raw_salary = document.getElementById("salary").value
    try:
        age = int(raw_age)        # e.g. 40
        salary = int(raw_salary)  # e.g. 100000
    except (TypeError, ValueError):
        console.error(f'predict(): invalid input age={raw_age!r}, salary={raw_salary!r}')
        pyscript.write('output', 'predict(): age and salary must be whole numbers')
        return

    # Local name chosen so it does not shadow the global X_test.
    sample = sc.transform([[age, salary]])
    y_pred = model.predict(sample)
    # NOTE(review): message text kept verbatim; "will be buy it" looks like
    # a typo for "will buy it" - confirm before changing user-facing text.
    result = 'A visitor will be buy it (購入する)' if y_pred[0] == 1 else 'A visitor will not buy it (購入しない)'
    pyscript.write('output', f'age={age}, salary={salary:,.0f} ▶ {result}')