In [2]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# Look up the built-in XGBoost training image for the region of the current boto3 session.
# NOTE(review): 'latest' does not pin a specific container version -- confirm which
# XGBoost release it resolves to before relying on the hyperparameters below.
container = sagemaker.image_uris.retrieve(
    region=boto3.Session().region_name,
    framework='xgboost',
    version='latest',
)

# S3 location of the processed dataset; `bucket` and `prefix` are reused by later cells.
bucket = 'glue-sagemaker-demo-output'
prefix = 'web-marketing/processed'

# Training and validation channels, both declared as CSV input.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/training/',
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/validation/',
    content_type='csv',
)

# Build the estimator: one ml.m4.xlarge instance, model artifacts written under
# the same bucket/prefix, running with the notebook's execution role.
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    container,
    role=get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess,
)

# Binary classifier (logistic objective) trained for 100 boosting rounds.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

# Launch the training job with both channels attached.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2022-10-24 07:36:16 Starting - Starting the training job...
2022-10-24 07:36:43 Starting - Preparing the instances for trainingProfilerReport-1666596976: InProgress
............
2022-10-24 07:38:39 Downloading - Downloading input data...
2022-10-24 07:38:59 Training - Downloading the training image......
2022-10-24 07:40:17 Training - Training image download completed. Training in progress....
2022-10-24 07:40:44 Uploading - Uploading generated training model
2022-10-24 07:40:44 Completed - Training job completed
..Training seconds: 143
Billable seconds: 143


In [3]:
# Host the trained model behind an endpoint backed by a single ml.m4.xlarge
# instance; `xgb_predictor` is the client handle used by the cells below.
# NOTE(review): nothing in this notebook tears the endpoint down afterwards --
# confirm whether cleanup is handled elsewhere.
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

-------!

In [4]:
# Serialize request payloads as CSV so feature arrays can be sent to the endpoint.
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

# Read the test dataset: every object under the test prefix is parsed as a
# header-less CSV and the pieces are concatenated into one DataFrame.
import io
import pandas as pd

s3 = boto3.resource('s3')
bucket_obj = s3.Bucket(bucket)

test_line = []
# Reuse `prefix` from the training cell so train/validation/test share one root.
test_objs = bucket_obj.objects.filter(Prefix=f"{prefix}/test")
for obj in test_objs:
    # Download errors are NOT swallowed: silently losing part of the test set
    # would skew the accuracy reported below.
    body = obj.get()['Body'].read()
    try:
        temp = pd.read_csv(io.BytesIO(body), header=None, encoding='utf8', sep=',')
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        # Objects that cannot be parsed as CSV (e.g. empty ones) are still
        # skipped, but visibly, instead of via a bare `except`.
        print(f"Skipping s3://{bucket}/{obj.key}: {err}")
        continue
    test_line.append(temp)

if not test_line:
    # Same exception type pd.concat would raise on an empty list, with a
    # message that says where the data was expected.
    raise ValueError(f"No readable CSV objects found under s3://{bucket}/{prefix}/test")

test_df = pd.concat(test_line)

#predict results using deployed model
import numpy as np
def predict(data, predictor, rows=500):
    """Run `data` through `predictor` in batches and return the parsed predictions.

    Parameters
    ----------
    data : numpy.ndarray
        Feature rows; it is split along axis 0 into batches.
    predictor : object
        Anything exposing ``predict(array)`` that returns bytes holding
        comma-separated numbers (decoded here as UTF-8).
    rows : int, optional
        Batch-size hint. The number of batches is ``int(n / rows + 1)``, so no
        batch holds more than ``rows`` rows.

    Returns
    -------
    numpy.ndarray
        1-D float array with the predictions of all batches, in input order.
        Empty when `data` has no rows (the predictor is not called at all).
    """
    if data.shape[0] == 0:
        # Nothing to score; avoid sending an empty payload to the predictor.
        return np.empty(0, dtype=float)

    n_chunks = int(data.shape[0] / float(rows) + 1)
    # Collect the responses in a list and join once, instead of rebuilding the
    # whole string on every iteration. array_split may yield empty trailing
    # chunks when n_chunks exceeds the row count; those are skipped.
    responses = [
        predictor.predict(chunk).decode('utf-8')
        for chunk in np.array_split(data, n_chunks)
        if chunk.shape[0] > 0
    ]
    return np.fromstring(','.join(responses), sep=',')

# Score the test set against the deployed model. Column 0 of test_df is the
# target; every remaining column is sent to the endpoint as a feature.
from sklearn.metrics import accuracy_score, confusion_matrix

predictions = predict(test_df.iloc[:, 1:].to_numpy(), xgb_predictor)

# Round the returned scores to 0/1 labels and compare with the true labels.
y_true = test_df.iloc[:, 0].to_numpy().tolist()
y_pred = np.round(predictions)
print('Accuracy score: ', accuracy_score(y_true, y_pred))
print('Confusion matrix: \n', confusion_matrix(y_true, y_pred))


Accuracy score:  0.8688688688688688
Confusion matrix: 
 [[407  65]
 [ 66 461]]
