Data Visualization

About Dataset

This is a transnational data set containing all transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.

Source of Data

🔗 UCI Machine Learning Repository

Cleaned Data Source

🔗 Kaggle

View whole source code

🔗 github link

Data Sample


                # Libraries
                !pip install mpld3
                import mpld3
                import numpy as np
                import pandas as pd
                import matplotlib.pyplot as plt
                import seaborn as sns  # Just for some data
                from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
                import plotnine
                from plotnine import *
                from plotnine import ggplot, geom_boxplot, aes, theme, element_text, element_blank, scale_y_log10, labs, element_rect, guides, scale_color_manual
                from plotnine.themes import theme_minimal
                from plotnine.geoms import geom_hline, geom_jitter
                from plotnine.scales import scale_fill_brewer
                from plotnine import facet_wrap
                from mizani.breaks import date_breaks
                import plotly.express as px # used for interactive visualizations
                import plotly.graph_objects as go
                from plotly.offline import plot
                from plotly.offline import init_notebook_mode, iplot # plot plotly graphs in line in a notebook
                import pycountry
                import calendar

                import warnings        
                warnings.filterwarnings("ignore") # ignores warnings

                data = pd.read_csv('online_retail_cleaned.csv')
                data.head(10)

Visualizing the Sales by Country (Pie Chart) using plotly


                # calculate total sales by country
                country_sales = pd.DataFrame(data.groupby("Country") \
                ["ItemTotal"].sum()).reset_index().rename({"ItemTotal":"TotalSales"},axis=1)
                fig = px.pie(country_sales,
                            values="TotalSales",
                            names="Country",
                            title="Percent of Total Sales by Country",
                            color_discrete_sequence=px.colors.qualitative.G10
                            )

                fig.update_traces(
                                textposition="inside",
                                textinfo="percent+label"
                                )
                fig.update_layout(
                                margin=dict(l=20, r=70, b=20, t=90, pad=0),
                                titlefont=dict(size=20),
                                width=800,  
                                height=600  
                                )

                # Save the animation as an HTML file
                plot(fig, filename='Percent of Total Sales by Country.html', auto_open=False)

                iplot(fig)

Major Contribution: We can see that the UK has a major contribution towards sales.

Now, We will plot this map without the UK sales since it would skew the scale due to its predominant share of sales.

Visualizing the Sales by Country (Bar plot) using plotnine


                    # also drop "Unspecified" and "European Community" since they cannot be mapped
                    country_sales = country_sales[country_sales["Country"] \
                    .isin(["Unspecified", "European Community","United Kingdom"]) == False]

                    # Create bar plot using Plotnine
                    bar_plot = (
                        ggplot(country_sales, aes(x='Country', y='TotalSales', fill='Country')) +
                        geom_bar(stat='identity') +
                        labs(title="Total Sales by Country") +
                        theme_minimal() +
                        theme(axis_text_x=element_text(angle=90, hjust=1),
                            legend_position='bottom',
                            figure_size=(10, 10)) 
                    )

                    bar_plot.save("bar_plot.jpg")

                    bar_plot.show()

Visualizing the Sales by Country (Map)using plotly


                    # rename countries into names which are recognised by plotly
                    country_sales["Country"] = \
                    country_sales["Country"].replace({"EIRE": "Ireland",
                                                      "Channel Islands": "United Kingdom",
                                                      "RSA": "South Africa"})

                    # https://stackoverflow.com/questions/59812824/plotly-express-plot-not-shown-in-jupyter-notebook
                    choro_data = dict(
                                    type = "choropleth",
                                    colorscale = "Agsunset",
                                    locations = country_sales["Country"],
                                    locationmode = "country names",
                                    z = country_sales["TotalSales"].astype(float).round(0).tolist(),
                                    hovertext = country_sales["Country"],
                                    hovertemplate = "%{hovertext}: £%{z:,.0f} ",
                                    # https://stackoverflow.com/questions/59057881/python-plotly-how-to-customize-hover-template-on-with-what-information-to-show
                                    colorbar = {"title" : "Total Sales", },
                                    ) 
                    
                    layout = dict(
                                title = "Total Sales By Country Excluding the UK",
                                titlefont = dict(size = 20),
                                geo = dict(showframe = False, bgcolor = "#BAEAED",),
                                margin={"r":0,"t":50,"l":10,"b":0}  
                                )
                    
                    fig = go.Figure(data=choro_data, layout=layout)
                    
                    # Save the animation as an HTML file
                    plot(fig, filename='total_sales_map.html', auto_open=False)
                    
                    # fig.update_traces(visible = True)
                    iplot(fig)

Overall sales trend (using plotnine)


                    # create columns extracting the year and month of InvoiceDates
                    data["year"], data["month"] = data["InvoiceDate"].dt.year, data["InvoiceDate"].dt.month

                    sales = data.groupby(["year","month"])["ItemTotal"].sum() \
                    .reset_index().rename({"ItemTotal":"TotalSales"},axis=1)

                    # create rows about April 2010 and November 2010 with averages from preceding and following month 
                                                                                                  
                    new_rows = pd.DataFrame({"year":[2010,2010],
                                            "month": [4,11],
                                            "TotalSales": [38123.21,184980.04]},
                                            index = [98,99]) # arbitrary indexes

                    # insert the row in the sales table
                    sales = pd.concat([new_rows, sales]) \
                    .sort_values(by=["year","month"]).reset_index(drop=True)

                    # convert numbers into month names
                    sales["month"] = sales["month"].apply(lambda x: calendar.month_abbr[x])

                    # combine month and year
                    sales["month"] = sales["month"].astype(str) + " " + sales["year"].astype(str)

                    # drop the redundant year column
                    sales = sales.drop("year", axis = 1) 

                    sales = sales[0:23] # drop December 2011 since the data does not cover the whole month

                    # Define the order of months as they appear in the DataFrame
                    month_order = sales['month'].tolist()

                    # Create line chart using Plotnine
                    line_chart = (
                        ggplot(sales, aes(x='month', y='TotalSales', group='factor(month.str[:0])')) +  # Group by year
                        geom_line(color='purple', size=1.5) +
                        geom_point(color='purple', size=3) +
                        labs(title="Total Sales by Month",
                            x="Month",
                            y="Total Sales (in £)") +
                        theme_minimal() +
                        theme(axis_text_x=element_text(angle=45, hjust=1)) +
                        scale_x_discrete(limits=month_order)  # Set the order of x-axis ticks
                    )

                    line_chart.save("line_plot.jpg")
                    line_chart.show()

Conclusion : We can see a clear upward📈 trend with initial sharp increses towards the end of 2010 and the beginning of 2011. Let's investigate whether the number of customers in each month exhibit a similar trend.

How Many Customers Purchased Products Each Month and How Many New Customers were There Each Month?


                    customers = data[data["CustomerID"].notna()].groupby(["year", "month"]) \
                    .agg({"CustomerID": "unique"}) \
                    .reset_index().rename({"CustomerID": "unique_customer_ids"}, axis = 1)
                    
                    # calculate the number of unique customers and insert it as a column
                    customers.insert(2,"unique_customers_this_month", customers["unique_customer_ids"].str.len())

                    ids = []

                    # creates a running list of customerids up to each month
                    for index, row in customers.iterrows(): 
                        if index == 0:
                            ids.append(row["unique_customer_ids"].tolist())
                            
                        else:   # adds the present ids to the accumulated list of previous ids
                            ids.append(row["unique_customer_ids"].tolist() + ids[index-1])

                    total_customers = []
                    for i in range(len(ids)):
                        total_customers.append(len(set(ids[i]))) # the set removes duplicates  
                        
                    # insert as a column
                    customers.insert(3, "total_customers", total_customers)

                    # add the first difference of total_customers

                    customers.insert(3, "new_customers_this_month", customers["total_customers"].diff() \
                    .replace({np.nan: 98}).astype(int)) # fill in the first value

                    # drop the long lists of unique customers
                    customers = customers.drop("unique_customer_ids", axis = 1)

                    # create rows about April 2010 and November 2010 with averages from preceding and following months
                    new_rows = \
                    pd.DataFrame({"year":[2010,2010],
                                "month": [4,11],
                                "unique_customers_this_month": [65,271],
                                "new_customers_this_month": [59,163],
                                "total_customers": [288,803]}, index = [98,99]) # arbitrary indexes

                    # insert the row in the customers table
                    customers = pd.concat([new_rows, customers]) \
                    .sort_values(by=["year","month"]).reset_index(drop=True)

                    # convert numbers into month names
                    customers["month"] = customers["month"].apply(lambda x: calendar.month_abbr[x])

                    # combine month and year
                    customers["month"] = customers["month"].astype(str) + " " + customers["year"].astype(str)

                    # drop the redundant year column
                    customers = customers.drop("year", axis = 1)

                    customers = customers[0:23] # drop December 2011 since the data does not cover the whole month

                    trace1 = go.Scatter(
                                        x = customers["month"],
                                        y = customers["unique_customers_this_month"],
                                        mode = "lines+markers",
                                        name = "Unique Customers This Month",
                                        line = dict(width = 4),
                                        marker = dict(
                                                    size = 10,
                                                    color = "#0E79B2"
                                                    ),
                                        hovertemplate = "%{x}
Unique Customers: %{y} ",
                                    )
                    trace2 = go.Scatter(
                                        x = customers["month"],
                                        y = customers["new_customers_this_month"],
                                        mode = "lines+markers",
                                        name = "New Customers This Month",
                                        line = dict(width = 4),
                                        marker = dict(
                                                    size = 10,
                                                    color = "rgba(242, 225, 39, 1)"
                                                    ),
                                        hovertemplate = "%{x}
New Customers: %{y} ",
                                    )
                    trace3 = go.Scatter(
                                        x = customers["month"],
                                        y = customers["total_customers"],
                                        mode = "lines+markers",
                                        name = "Total Customers",
                                        line = dict(width = 4), 
                                        marker = dict(
                                                    size = 10,
                                                    color = "rgba(242, 39, 127, 1)"
                                                    ),
                                        hovertemplate = "%{x}
Total Customers: %{y} ",
                                    )

                    line_data = [trace1, trace2, trace3]

                    layout = dict(
                                title = "Customers by Month",
                                titlefont = dict(size = 20),
                                margin=dict(l=10, r=50, b=10, t=70, pad=0),
                                xaxis= dict(title= "Month",ticklen = 5,zeroline = False),
                                yaxis= dict(title= "Number of Customers"),
                                width=900,  
                                height=600,
                                legend=dict(
                                            font = dict(size = 12),
                                            yanchor = "top",
                                            y=0.98,
                                            x= 0.01
                                            )
                                )
                    fig = dict(data = line_data, layout = layout)

                    # Save the animation as an HTML file
                    plot(fig, filename='total Customers each month.html', auto_open=False)

                    iplot(fig)


                    # Make a copy of customers DataFrame
                    customers_ = customers.copy()
                    
                    # Convert "month" column to datetime type with specified format
                    customers_['month'] = pd.to_datetime(customers_['month'], format='%b %Y')
                    
                    # Plot using Plotnine
                    (ggplot(customers_)
                     + aes(x='month')
                     + geom_line(aes(y='new_customers_this_month'), color='blue', size=1, show_legend=True)
                     + geom_point(aes(y='new_customers_this_month'), color='blue', size=2)
                     + geom_line(aes(y='unique_customers_this_month'), color='red', size=1, show_legend=True)
                     + geom_point(aes(y='unique_customers_this_month'), color='red', size=2)
                     + labs(x='Month', y='Customers', color='Plot')
                     + theme_bw()
                     + theme(axis_text_x=element_text(rotation=90, hjust=0.5))
                     + scale_x_datetime(date_breaks="1 month", date_labels="%b %Y")  
                     + scale_y_continuous(breaks=range(0, max(customers_['unique_customers_this_month'])+1, 100)) 
                     + scale_color_manual(values=["blue", "red"], labels=["New Customers", "Unique Customers"])
                    )

💡We can check that invoices either have missing values for all CustomerIDs, or no CustomerIDs are missing. Since we do not know how many customers have an NaN value for CustomerID, we drop records with missing CustomerIDs for this analysis.

Observations:

We can see a significant jump📈 in new customers around December 2010 and January 2011, likely related to the Christmas holidays.
We can see another spike📈 in customers in November 2011, although there is no spike in new customers. This might suggest that customers were satisfied with their Christmas purchases in 2010, and are now coming back to the online store for their Christmas 2011 purchases.
We do infact see that the line chart of the number of total customers has a concave shape. In other words, the number of total customers is increasing at a descreasing rate. As commented above, this is directly related to the decreasing number of new customers.

What Time During the Day Do Customers Make the Most Purchases?

Strategy 💡:

We have seen spikes in sales around December 2010, which is likely related to the Christmas holidays. Since the last date of this dataset is December 2011 (which we assume is the present), we expect that the business would be running marketing campaigns for this year's Christmas holidays as well. Therefore, it would be useful to find out what time of day customers make the most purchases in order to target the marketing campaigns around these times.
To check this, we can look at what time customers were making the most orders the previous December (i.e. December 2010), as well as the most common hours for purchases during the last 2 months (i.e. October and November 2011). Since we have data about 12 days of December 2011, we include these in the analysis as well.


                    # take data only about December 2010 and October, Novermber and December 2011
                    subset = data[
                                ((data["year"] == 2010) & (data["month"] == 12))
                                |    
                                ((data["year"] == 2011) & (data["month"] == 10))
                                |
                                ((data["year"] == 2011) & (data["month"] == 11))
                                |
                                ((data["year"] == 2011) & (data["month"] == 12))
                                ]
                    # extract the hour of purchase from InvoiceDate and add it as a column
                    subset["hour"] = subset["InvoiceDate"].astype(str).str[11:13].astype(int)
                     
                    # calculate the total number of orders for each hour of the day in these months
                    frequency = subset.groupby(["year","month","hour"]) \
                    .agg({"InvoiceNo":"nunique"}).reset_index() \
                    .rename({"InvoiceNo": "num_orders"}, axis = 1)

                    pivot = frequency.pivot(index = "hour", columns = ["year","month"], values = ["num_orders"])
                    pivot = pd.DataFrame(pivot.to_records()) # flattens multilevel column headings
                    pivot["hour"] = pivot["hour"].astype(str) + ":00" # make hours more readable
                    pivot = pivot.set_index("hour")
                    pivot.index.name = "" # remove index name for plotting
                    pivot.rename(columns={ # set more readable names
                                        pivot.columns[0]:"Dec 2010",
                                        pivot.columns[1]:"Oct 2011",
                                        pivot.columns[2]:"Nov 2011",
                                        pivot.columns[3]:"Dec 2011"
                                        }, inplace = True)
                    # Apply styling
                    styled_df = pivot.replace(np.nan, 0).style \
                        .background_gradient(cmap="rocket", subset=["Dec 2010"]) \
                        .background_gradient(cmap="rocket", subset=["Oct 2011"]) \
                        .background_gradient(cmap="rocket", subset=["Nov 2011"]) \
                        .background_gradient(cmap="rocket", subset=["Dec 2011"]) \
                        .format("{:.0f}")
                    
                    # Convert styled DataFrame to HTML and save to a file
                    styled_df.to_html("heatmap_table.html", escape=False)

                    print(styled_df)


                    # Transpose the DataFrame to have hours as rows and months as columns
                    pivot_transposed = pivot.T.reset_index()

                    # Rename the columns to match the required format
                    pivot_transposed.columns = ['month', '7:00', '8:00', '9:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00']

                    # Melt the DataFrame to long format
                    melted_df = pivot_transposed.melt(id_vars=['month'], var_name='hour', value_name='orders')

                    # Convert hour to categorical to maintain the order
                    melted_df['hour'] = pd.Categorical(melted_df['hour'], categories=['7:00', '8:00', '9:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00'])

                    # Plot the heatmap
                    heatmap = (
                        ggplot(melted_df, aes(x='hour', y='month', fill='orders'))
                        + geom_tile()
                        + scale_fill_gradient(low="black", high="white", limits=[30, 350], breaks=[30, 190, 350], labels=["low", "medium", "high"])
                        + theme(axis_text_x=element_text(angle=45, hjust=1),
                            figure_size=(12, 4))
                        + ggtitle("Total Number of Orders for Each Hour of the Day")
                    )

                    # Show the heatmap
                    print(heatmap)

Conclusion : We can see, that the most orders are placed around midday, more specifically at 12:00. Therefore, the marketing team could target their campaigns around these times to maximize conversions.

Used Faceting(using plotnine)

Which is the Best Selling Product in Each Country?


                # calculate total sales for each product in each country
                sales_countr_descr = data.groupby(["Country", "Description"]) \
                .agg({"ItemTotal": "sum"}) \
                .rename({"ItemTotal":"TotalSales"},axis=1).reset_index()

                # find the total sales of the best selling product in each country
                max_sales = pd.DataFrame(sales_countr_descr.groupby("Country") \
                ["TotalSales"].max().reset_index()) \
                .rename({"TotalSales":"Best_Product_Total_Sales"},axis=1)

                # join the two tables from above together
                joined_df = sales_countr_descr.merge(max_sales, on = "Country", how = "left")

                # in other words, find the best selling item in each country
                joined_df = joined_df[joined_df["TotalSales"] == joined_df["Best_Product_Total_Sales"]]
                joined_df.head()

                # drop the redundant column
                joined_df = joined_df.drop("Best_Product_Total_Sales", axis = 1)

                # total sales in each country
                country_sales = pd.DataFrame(data.groupby("Country")["ItemTotal"].sum()) \
                .reset_index().rename({"ItemTotal":"Country_Total_Sales"}, axis = 1)

                joined_df = joined_df.merge(country_sales, on = "Country", how = "inner") \
                .rename({"Description":"Best_Selling_Product"}, axis = 1)

                joined_df["%_of_Country_Sales"] = (joined_df["TotalSales"] / joined_df["Country_Total_Sales"])

                styled_df = joined_df.style.background_gradient(cmap=sns.light_palette("seagreen", as_cmap=True), \
                    subset=["%_of_Country_Sales"]) \
                    .format({"%_of_Country_Sales":"{:.2%}",
                            "TotalSales":"£{:,.0f}",
                            "Country_Total_Sales":"£{:,.0f}"})
                print(styled_df)

Which product is the most common best seller?


                    df_ = pd.DataFrame(joined_df["Best_Selling_Product"].value_counts()) \
                    .rename({"Best_Selling_Product":"Best_Selling_Product_in_X_Countries"},axis=1)
                    print(df_)

Which are the Most Successful Products Overall?

💡A characteristic we could use to answer this is: which products of relatively high price have sold relatively high quantities. This can be easily visualized with a scatter plot of all products with total quantity sold and average unitprice on the x and y axes. Products' markers to the north-east on the scatter plot suggest relatively high quantities sold at a relatively high price.


                    # find total quantity sold and average unit price for all products
                    products = data.groupby("Description") \
                    .agg({"Quantity":"sum", "UnitPrice":"mean"}).reset_index()

                    products = products[products["Description"].isin(["DOTCOM","Manual","Discount"]) == False] # remove for plotting

                    trace = go.Scatter(
                                        x = products["Quantity"],
                                        y = products["UnitPrice"],
                                        customdata = products["Description"],
                                        hovertemplate = "%{customdata}
Quantity Sold: %{x}
UnitPrice: £%{y:.2f}
",
                                        mode = "markers",
                                        name = "Products",
                                        line = dict(width = 4),
                                        marker = dict(
                                                    size = 10,
                                                    color = "#1199cf"
                                                    )
                                        )

                    scatter_data = [trace]

                    layout = dict(
                                title = "Products",
                                titlefont = dict(size = 20),
                                margin=dict(l=10, r=50, b=10, t=70, pad=0),
                                xaxis= dict(title= "Quantity Sold",ticklen = 5,zeroline = False),
                                width=1000,  
                                height=600,
                                yaxis= dict(title= "Unit Price")
                                )
                    fig = dict(data = scatter_data, layout = layout)

                    iplot(fig)


                    # Create a new DataFrame for tooltips
                    tooltip_df = products[['Description', 'Quantity', 'UnitPrice']]

                    # Define the plot
                    scatter_plot = (
                        ggplot(products, aes(x='Quantity', y='UnitPrice')) +
                        geom_point(size=10, color="#1199cf") +
                        labs(title="Products", x="Quantity Sold", y="Unit Price") +
                        theme_minimal() +
                        theme(
                            plot_title=element_text(size=20),
                            axis_text_x=element_text(size=10),
                            axis_text_y=element_text(size=10)
                        )
                    )

                    print(scatter_plot)

Conclusion : Above we saw "REGENCY CAKESTAND 3 TIER" as the most common best seller (8 countries) and here we see it as a high selling product as well - its marker is visually separated from the other products on the plot.

Which Customers Contributed the Most to Total Sales?


                    cust_purchases = \
                    data.groupby("CustomerID") \
                    .agg({"InvoiceNo": "nunique", "ItemTotal": "sum"}) \
                    .rename({"ItemTotal":"TotalPurchases","InvoiceNo":"Number_of_Orders"},axis = 1) \
                    .sort_values(by = "TotalPurchases", ascending = False)

                    cust_purchases.head(15) # only first 15

How much did each customer contribute to the total sales figure in percentage terms?


                    cust_purchases["Percent_of_TotalSales"] = (cust_purchases["TotalPurchases"]/data["ItemTotal"].sum()).map("{:.2%}".format)
                    cust_purchases["TotalPurchases"] = cust_purchases["TotalPurchases"].map("£{:,.0f}".format) # format as currency

                    cust_purchases.head(15) # first 15 only

We saw that the sales were increasing at relatively larger rate during october 2010 to january 2011 , Now we'll see that which product has made larger contribution during this interval


                    # Step 1: Filter the dataframe
                    filtered_data = data[(data['year'] == 2010) & (data['month'] >= 10) | (data['year'] == 2011) & (data['month'] <= 1)]
                    # find total quantity sold and average unit price for all products
                    products1 = filtered_data.groupby("Description") \
                    .agg({"Quantity":"sum", "UnitPrice":"mean"}).reset_index()

                    products1 = products1[products1["Description"].isin(["DOTCOM","Manual","Discount"]) == False] # remove for plotting
                    trace = go.Scatter(
                    x = products1["Quantity"],
                    y = products1["UnitPrice"],
                    customdata = products1["Description"],
                    hovertemplate = "%{customdata}
Quantity Sold: %{x}
UnitPrice: £%{y:.2f}
",
                    mode = "markers",
                    name = "Products",
                    line = dict(width = 4),
                    marker = dict(
                                  size = 10,
                                  color = "#1199cf"
                                 )
                    )

                    scatter_data = [trace]

                    layout = dict(
                                title = "Products",
                                titlefont = dict(size = 20),
                                margin=dict(l=10, r=50, b=10, t=70, pad=0),
                                xaxis= dict(title= "Quantity Sold",ticklen = 5,zeroline = False),
                                width=1000,  
                                height=600,
                                yaxis= dict(title= "Unit Price")
                                )
                    fig = dict(data = scatter_data, layout = layout)

                    # Save the animation as an HTML file
                    plot(fig, filename='most successful product overall2', auto_open=False)

                    iplot(fig)


                    # Create a new DataFrame for tooltips
                    tooltip_df = products1[['Description', 'Quantity', 'UnitPrice']]

                    # Define the plot
                    scatter_plot = (
                        ggplot(products1, aes(x='Quantity', y='UnitPrice')) +
                        geom_point(size=2, color="#1199cf") +
                        labs(title="Products", x="Quantity Sold", y="Unit Price") +
                        theme_minimal() +
                        theme(
                            plot_title=element_text(size=20),
                            axis_text_x=element_text(size=10),
                            axis_text_y=element_text(size=10)
                        )
                    )

                    scatter_plot

Conclusion : We can see that the same product RAGENCY CAKESTAND 3 TIER has again contributed the most. So, it's not like any Christmas-specific changed behavior on that single product.

As we know that in february month there is valentines day. So, let's see if we can find any product which is more specific to this


                    # Step 1: Filter the dataframe
                    filtered_data = data[(data['month'] == 2) & data['Quantity']>0]
                    # find total quantity sold and average unit price for all products
                    products2 = filtered_data.groupby("Description") \
                    .agg({"Quantity":"sum", "UnitPrice":"mean"}).reset_index()

                    products2 = products2[products2["Description"].isin(["DOTCOM","Manual","Discount"]) == False] # remove for plotting
                    trace = go.Scatter(
                                        x = products2["Quantity"],
                                        y = products2["UnitPrice"],
                                        customdata = products2["Description"],
                                        hovertemplate = "%{customdata}
Quantity Sold: %{x}
UnitPrice: £%{y:.2f}
",
                                        mode = "markers",
                                        name = "Products",
                                        line = dict(width = 4),
                                        marker = dict(
                                                    size = 10,
                                                    color = "#1199cf"
                                                    )
                                        )

                    scatter_data = [trace]

                    layout = dict(
                                title = "Products",
                                titlefont = dict(size = 20),
                                margin=dict(l=10, r=50, b=10, t=70, pad=0),
                                xaxis= dict(title= "Quantity Sold",ticklen = 5,zeroline = False),
                                width=1000,  
                                height=600,
                                yaxis= dict(title= "Unit Price")
                                )
                    fig = dict(data = scatter_data, layout = layout)

                    # Save the animation as an HTML file
                    plot(fig, filename='most successful product overall3', auto_open=False)

                    iplot(fig)


                    # Create a new DataFrame for tooltips
                    tooltip_df = products2[['Description', 'Quantity', 'UnitPrice']]

                    # Define the plot
                    scatter_plot = (
                        ggplot(products2, aes(x='Quantity', y='UnitPrice')) +
                        geom_point(size=2, color="#1199cf") +
                        labs(title="Products", x="Quantity Sold", y="Unit Price") +
                        theme_minimal() +
                        theme(
                            plot_title=element_text(size=20),
                            axis_text_x=element_text(size=10),
                            axis_text_y=element_text(size=10)
                        )
                    )

                    scatter_plot

Conclusion : We can see that the products such as WRAP ENGLISH ROSE, GINGHAM ROSE WRAP, WRAP PINK FAIRY CAKES, SET OF 3 CAKE TINS PANTRY DESIGN are relatively sold in higher quantity. So, during this month, the marketing team can target these types of customers to sell products.

Created by Harsh Raj

<--------------------------------------------------------------------The End------------------------------------------------------------------->