%reload_ext autoreload
%autoreload 2
from fastai.basics import *
import pandas as pd
import matplotlib.pyplot as plt
!curl -s https://course.fast.ai/setup/colab | bash
Updating fastai...
Done.

Data preparation / Feature engineering

In addition to the provided data, we will be using external datasets put together by participants in the Kaggle competition. You can download all of them here. Then you should untar them in the directory to which PATH points below.

For completeness, the implementation used to put them together is included below.

untar_data('http://files.fast.ai/part2/lesson14/rossmann',dest='/content/data/rossmann')
PosixPath('/content/data/rossmann/rossmann')
(Config().data_path()/'rossmann').ls()
[PosixPath('/root/.fastai/data/rossmann/googletrend.csv'),
 PosixPath('/root/.fastai/data/rossmann/weather.csv'),
 PosixPath('/root/.fastai/data/rossmann/store_states.csv'),
 PosixPath('/root/.fastai/data/rossmann/state_names.csv'),
 PosixPath('/root/.fastai/data/rossmann/sample_submission.csv'),
 PosixPath('/root/.fastai/data/rossmann/train.csv'),
 PosixPath('/root/.fastai/data/rossmann/store.csv'),
 PosixPath('/root/.fastai/data/rossmann/test.csv')]
(Config().data_path()/'rossmann')
PosixPath('/root/.fastai/data/rossmann')
PATH=(Config().data_path()/'rossmann')
table_names = ['train', 'store', 'store_states', 'state_names', 'googletrend', 'weather', 'test']
type(table_names)
list
??pd.read_csv
tables = [pd.read_csv(PATH/f'{fname}.csv', low_memory=False) for fname in table_names]
train, store, store_states, state_names, googletrend, weather, test = tables
len(train),len(test)
(1017209, 41088)
??train.isnull

Define functions for Data Exploration

def data_shape_and_head(df):
    # Print the shape, then return the first few rows for display
    pd.set_option('float_format', '{:f}'.format)
    print(f"Data Frame Shape: {df.shape}")
    return df.head()

def percentage_of_null_data(df):
    # Per-column null count, null percentage and dtype, transposed for easy reading
    pd.options.mode.use_inf_as_na = True  # count inf/-inf as NA as well
    total = df.isnull().sum()
    percent = df.isnull().sum() / df.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(df[col].dtype) for col in df.columns]
    return np.transpose(tt)

def unique_values_eachcolumn(df):
    # Number of distinct values in each column
    for column in df.columns.values:
        print(f"[df] Unique values of '{column}' : {df[column].nunique()}")
def plot_col_count_4_top20(column_name, title, df, size=6):
    # Bar chart of record counts for the 20 most frequent values of a column,
    # with each bar annotated with its percentage of all records
    import seaborn as sns
    f, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    g = sns.countplot(df[column_name],
                      order=df[column_name].value_counts().index[:20], palette='Dark2')
    g.set_title("Number and Percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(100*height/total),
                ha="center")
    plt.show()
def plot_most_or_least_populated_data(df, most=True):
    # Horizontal bars for the 30 most (or least) populated columns,
    # measured as the percentage of non-null values
    import seaborn as sns
    total = df.isnull().count() - df.isnull().sum()
    percent = 100 - (df.isnull().sum() / df.isnull().count() * 100)
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt = pd.DataFrame(tt.reset_index())
    tt = tt.sort_values(['Total'], ascending=not most)  # descending for "most"
    plt.figure(figsize=(10, 8))
    sns.set(style='darkgrid')
    ax = sns.barplot(x='Percent', y='index', data=tt.head(30), color='DarkOrange')
    plt.title(('Most' if most else 'Least') + ' frequent columns/features in the dataframe')
    plt.ylabel('Features/Columns')
    plt.show()

Data exploration for the train dataframe

data_shape_and_head(train)
Data Frame Shape: (1017209, 9)
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
0 1 5 2015-07-31 5263 555 1 1 0 1
1 2 5 2015-07-31 6064 625 1 1 0 1
2 3 5 2015-07-31 8314 821 1 1 0 1
3 4 5 2015-07-31 13995 1498 1 1 0 1
4 5 5 2015-07-31 4822 559 1 1 0 1
train.describe()
Store DayOfWeek Sales Customers Open Promo SchoolHoliday
count 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000
mean 558.429727 3.998341 5773.818972 633.145946 0.830107 0.381515 0.178647
std 321.908651 1.997391 3849.926175 464.411734 0.375539 0.485759 0.383056
min 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 280.000000 2.000000 3727.000000 405.000000 1.000000 0.000000 0.000000
50% 558.000000 4.000000 5744.000000 609.000000 1.000000 0.000000 0.000000
75% 838.000000 6.000000 7856.000000 837.000000 1.000000 1.000000 0.000000
max 1115.000000 7.000000 41551.000000 7388.000000 1.000000 1.000000 1.000000
train.describe().T
count mean std min 25% 50% 75% max
Store 1017209.000000 558.429727 321.908651 1.000000 280.000000 558.000000 838.000000 1115.000000
DayOfWeek 1017209.000000 3.998341 1.997391 1.000000 2.000000 4.000000 6.000000 7.000000
Sales 1017209.000000 5773.818972 3849.926175 0.000000 3727.000000 5744.000000 7856.000000 41551.000000
Customers 1017209.000000 633.145946 464.411734 0.000000 405.000000 609.000000 837.000000 7388.000000
Open 1017209.000000 0.830107 0.375539 0.000000 1.000000 1.000000 1.000000 1.000000
Promo 1017209.000000 0.381515 0.485759 0.000000 0.000000 0.000000 1.000000 1.000000
SchoolHoliday 1017209.000000 0.178647 0.383056 0.000000 0.000000 0.000000 0.000000 1.000000
train.count()
Store            1017209
DayOfWeek        1017209
Date             1017209
Sales            1017209
Customers        1017209
Open             1017209
Promo            1017209
StateHoliday     1017209
SchoolHoliday    1017209
dtype: int64
train['DayOfWeek'].count()
1017209
unique_values_eachcolumn(train)
[df] Unique values of 'Store' : 1115
[df] Unique values of 'DayOfWeek' : 7
[df] Unique values of 'Date' : 942
[df] Unique values of 'Sales' : 21734
[df] Unique values of 'Customers' : 4086
[df] Unique values of 'Open' : 2
[df] Unique values of 'Promo' : 2
[df] Unique values of 'StateHoliday' : 4
[df] Unique values of 'SchoolHoliday' : 2
plot_col_count_4_top20('Date', 'Counts', train,size=6)
plot_most_or_least_populated_data(train)
train.DayOfWeek.nunique()
7
train.StateHoliday.unique()
array(['0', 'a', 'b', 'c'], dtype=object)
percentage_of_null_data(train)
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
Total 0 0 0 0 0 0 0 0 0
Percent 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Types int64 int64 object int64 int64 int64 int64 object int64
train.Date
0          2015-07-31
1          2015-07-31
2          2015-07-31
3          2015-07-31
4          2015-07-31
              ...    
1017204    2013-01-01
1017205    2013-01-01
1017206    2013-01-01
1017207    2013-01-01
1017208    2013-01-01
Name: Date, Length: 1017209, dtype: object
train.groupby('Date').max().Sales
Date
2013-01-01    17267
2013-01-02    25357
2013-01-03    23303
2013-01-04    21996
2013-01-05    22521
              ...  
2015-07-27    27881
2015-07-28    25518
2015-07-29    25840
2015-07-30    24395
2015-07-31    27508
Name: Sales, Length: 942, dtype: int64
train.sample(5, random_state=300).groupby('Date').max().Customers
Date
2013-07-10    440
2014-05-09    540
2014-11-02      0
2015-04-06      0
2015-04-20    482
Name: Customers, dtype: int64
train.sample(20, random_state=5).groupby('Date').max().Sales.plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7f85651a9828>
train.groupby('DayOfWeek').DayOfWeek
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f85650f2198>
(train.groupby('DayOfWeek').sum())
Store Sales Customers Open Promo SchoolHoliday
DayOfWeek
1 80821168 1130203012 117675012 137560 77760 34060
2 81344288 1020411930 110848063 143961 77580 36595
3 81345276 954962863 105117642 141936 77580 34636
4 81443338 911177709 101732938 134644 77580 34747
5 81443338 980555941 108384820 138640 77580 36235
6 80821168 846317735 95103854 144058 0 2724
7 80821168 29551433 5179426 3593 0 2724
train[0:10]
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
0 1 5 2015-07-31 5263 555 1 1 0 1
1 2 5 2015-07-31 6064 625 1 1 0 1
2 3 5 2015-07-31 8314 821 1 1 0 1
3 4 5 2015-07-31 13995 1498 1 1 0 1
4 5 5 2015-07-31 4822 559 1 1 0 1
5 6 5 2015-07-31 5651 589 1 1 0 1
6 7 5 2015-07-31 15344 1414 1 1 0 1
7 8 5 2015-07-31 8492 833 1 1 0 1
8 9 5 2015-07-31 8565 687 1 1 0 1
9 10 5 2015-07-31 7185 681 1 1 0 1

We have looked at the train data; the other tables can be explored in the same way to identify the features we can use.


We turn StateHoliday into a boolean, to make it more convenient for modeling. We can do calculations on pandas fields using notation very similar (often identical) to numpy.

train.StateHoliday = train.StateHoliday!='0'
test.StateHoliday = test.StateHoliday!='0'
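
As a quick sanity check (a small sketch, not from the original run), both columns should now hold plain booleans:

# Expect dtype('bool') for both, and values [False, True]
train.StateHoliday.dtype, test.StateHoliday.dtype
train.StateHoliday.unique()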

join_df is a function for joining tables on specific fields. By default, we'll be doing a left outer join of right on the left argument using the given fields for each table.

Pandas does joins using the merge method. The suffixes argument describes the naming convention for duplicate fields. We've elected to leave the duplicate field names on the left untouched, and append a "_y" to those on the right.

??pd.merge
def join_df(left, right, left_on, right_on=None, suffix='_y'):
    if right_on is None: right_on = left_on
    return left.merge(right, how='left', left_on=left_on, right_on=right_on, 
                      suffixes=("", suffix))

Join weather/state names.

weather = join_df(weather, state_names, "file", "StateName")
weather
file Date Max_TemperatureC Mean_TemperatureC Min_TemperatureC Dew_PointC MeanDew_PointC Min_DewpointC Max_Humidity Mean_Humidity Min_Humidity Max_Sea_Level_PressurehPa Mean_Sea_Level_PressurehPa Min_Sea_Level_PressurehPa Max_VisibilityKm Mean_VisibilityKm Min_VisibilitykM Max_Wind_SpeedKm_h Mean_Wind_SpeedKm_h Max_Gust_SpeedKm_h Precipitationmm CloudCover Events WindDirDegrees StateName State
0 NordrheinWestfalen 2013-01-01 8 4 2 7 5 1 94 87 65 1013 1005 1001 31.000000 12.000000 4.000000 39 26 58.000000 5.080000 6.000000 Rain 215 NordrheinWestfalen NW
1 NordrheinWestfalen 2013-01-02 7 4 1 5 3 2 93 85 78 1028 1022 1014 31.000000 14.000000 10.000000 24 16 nan 0.000000 6.000000 Rain 225 NordrheinWestfalen NW
2 NordrheinWestfalen 2013-01-03 11 8 6 10 8 4 100 93 77 1035 1030 1026 31.000000 8.000000 2.000000 26 21 nan 1.020000 7.000000 Rain 240 NordrheinWestfalen NW
3 NordrheinWestfalen 2013-01-04 9 9 8 9 9 8 100 94 87 1036 1035 1034 11.000000 5.000000 2.000000 23 14 nan 0.250000 7.000000 Rain 263 NordrheinWestfalen NW
4 NordrheinWestfalen 2013-01-05 8 8 7 8 7 6 100 94 87 1035 1034 1033 10.000000 6.000000 3.000000 16 10 nan 0.000000 7.000000 Rain 268 NordrheinWestfalen NW
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
15835 Saarland 2015-09-13 21 17 12 16 14 12 100 88 68 1015 1010 1005 31.000000 15.000000 10.000000 27 10 50.000000 1.020000 7.000000 Rain 113 Saarland SL
15836 Saarland 2015-09-14 18 14 11 15 12 7 99 85 61 1009 1005 1004 31.000000 13.000000 4.000000 32 16 53.000000 9.910000 5.000000 Rain-Thunderstorm 213 Saarland SL
15837 Saarland 2015-09-15 16 12 9 11 8 7 93 77 62 1010 1008 1004 31.000000 12.000000 10.000000 34 14 nan 0.000000 5.000000 Rain 193 Saarland SL
15838 Saarland 2015-09-16 19 15 11 16 13 10 97 90 75 1004 999 995 31.000000 10.000000 4.000000 32 14 45.000000 20.070000 7.000000 Rain-Thunderstorm 147 Saarland SL
15839 Saarland 2015-09-17 14 13 12 14 12 10 99 92 82 1013 1005 999 31.000000 14.000000 8.000000 27 16 47.000000 6.100000 6.000000 Rain 202 Saarland SL

15840 rows × 26 columns

In pandas you can add new columns to a dataframe simply by assigning them. We'll do this for googletrend by extracting dates and state names from the given data and adding those columns.

We're also going to replace all instances of state name 'NI' with 'HB,NI' to match the usage in the rest of the data. This is a good opportunity to highlight pandas indexing. We can use .loc[rows, cols] to select a list of rows and a list of columns from the dataframe. In this case, we're selecting rows with state name 'NI' via the boolean mask googletrend.State=='NI' and selecting the "State" column.

googletrend.index[5]
5
googletrend.week.str.split(' - ', expand=True)[1]
0       2012-12-08
1       2012-12-15
2       2012-12-22
3       2012-12-29
4       2013-01-05
           ...    
2067    2015-09-05
2068    2015-09-12
2069    2015-09-19
2070    2015-09-26
2071    2015-10-03
Name: 1, Length: 2072, dtype: object
googletrend
file week trend Date State
0 Rossmann_DE_SN 2012-12-02 - 2012-12-08 96 2012-12-02 SN
1 Rossmann_DE_SN 2012-12-09 - 2012-12-15 95 2012-12-09 SN
2 Rossmann_DE_SN 2012-12-16 - 2012-12-22 91 2012-12-16 SN
3 Rossmann_DE_SN 2012-12-23 - 2012-12-29 48 2012-12-23 SN
4 Rossmann_DE_SN 2012-12-30 - 2013-01-05 67 2012-12-30 SN
... ... ... ... ... ...
2067 Rossmann_DE_SL 2015-08-30 - 2015-09-05 95 2015-08-30 SL
2068 Rossmann_DE_SL 2015-09-06 - 2015-09-12 47 2015-09-06 SL
2069 Rossmann_DE_SL 2015-09-13 - 2015-09-19 80 2015-09-13 SL
2070 Rossmann_DE_SL 2015-09-20 - 2015-09-26 57 2015-09-20 SL
2071 Rossmann_DE_SL 2015-09-27 - 2015-10-03 0 2015-09-27 SL

2072 rows × 5 columns

googletrend['Date'] = googletrend.week.str.split(' - ', expand=True)[0]
googletrend['State'] = googletrend.file.str.split('_', expand=True)[2]
googletrend.loc[googletrend.State=='NI', "State"]
Series([], Name: State, dtype: object)
googletrend.loc[googletrend.State=='NI', "State"] = 'HB,NI'
googletrend
file week trend Date State
0 Rossmann_DE_SN 2012-12-02 - 2012-12-08 96 2012-12-02 SN
1 Rossmann_DE_SN 2012-12-09 - 2012-12-15 95 2012-12-09 SN
2 Rossmann_DE_SN 2012-12-16 - 2012-12-22 91 2012-12-16 SN
3 Rossmann_DE_SN 2012-12-23 - 2012-12-29 48 2012-12-23 SN
4 Rossmann_DE_SN 2012-12-30 - 2013-01-05 67 2012-12-30 SN
... ... ... ... ... ...
2067 Rossmann_DE_SL 2015-08-30 - 2015-09-05 95 2015-08-30 SL
2068 Rossmann_DE_SL 2015-09-06 - 2015-09-12 47 2015-09-06 SL
2069 Rossmann_DE_SL 2015-09-13 - 2015-09-19 80 2015-09-13 SL
2070 Rossmann_DE_SL 2015-09-20 - 2015-09-26 57 2015-09-20 SL
2071 Rossmann_DE_SL 2015-09-27 - 2015-10-03 0 2015-09-27 SL

2072 rows × 5 columns

The following extracts particular date fields from a complete datetime for the purpose of constructing categoricals.

You should always consider this feature extraction step when working with date-times. Without expanding your date-time into these additional fields, you can't capture any trend/cyclical behavior as a function of time at any of these granularities. We'll apply it to every table that has a date field.

def add_datepart(df, fldname, drop=True, time=False):
    "Helper function that adds columns relevant to a date."
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)
add_datepart(weather, "Date", drop=False)
add_datepart(googletrend, "Date", drop=False)
add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)
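
To see what the helper generates, here's a minimal sketch on a made-up two-row frame (the frame is invented for illustration):

# Toy frame; targ_pre is '' because the field is named exactly 'Date'
demo = pd.DataFrame({'Date': ['2015-07-31', '2013-01-01']})
add_datepart(demo, 'Date', drop=False)
demo.columns.tolist()
# ['Date', 'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
#  'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
#  'Is_year_end', 'Is_year_start', 'Elapsed']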

The Google trends data has a special category for the whole of Germany - we'll pull that out so we can use it explicitly.

trend_de = googletrend[googletrend.file == 'Rossmann_DE']

Now we can outer join all of our data into a single dataframe. Recall that in outer joins, every time a value in the joining field on the left table does not have a corresponding value on the right table, the corresponding row in the new table has Null values for all right table fields. One way to check that all records are consistent and complete is to check for Null values post-join, as we do here.

Aside: Why not just do an inner join? If you are assuming that all records are complete and match on the field you desire, an inner join will do the same thing as an outer join. However, in the event you are wrong or a mistake is made, an outer join followed by a null-check will catch it. (Comparing row counts before and after an inner join is equivalent, but requires keeping track of those counts. The outer join is easier.)
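
Here is the pattern on toy frames (invented for illustration): the left join keeps the unmatched key and flags it with a null, where an inner join would silently drop the row.

left = pd.DataFrame({'Store': [1, 2, 3]})
right = pd.DataFrame({'Store': [1, 2], 'State': ['HE', 'TH']})  # Store 3 has no match
merged = left.merge(right, how='left', on='Store')
len(merged[merged.State.isnull()])  # 1 -> a mistake an inner join would have hidden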

store = join_df(store, store_states, "Store")
len(store[store.State.isnull()])
0
??join_df
joined = join_df(train, store, "Store")
joined_test = join_df(test, store, "Store")
len(joined[joined.StoreType.isnull()]),len(joined_test[joined_test.StoreType.isnull()])
(0, 0)
joined = join_df(joined, googletrend, ["State","Year", "Week"])
joined_test = join_df(joined_test, googletrend, ["State","Year", "Week"])
len(joined[joined.trend.isnull()]),len(joined_test[joined_test.trend.isnull()])
(0, 0)
joined = joined.merge(trend_de, 'left', ["Year", "Week"], suffixes=('', '_DE'))
joined_test = joined_test.merge(trend_de, 'left', ["Year", "Week"], suffixes=('', '_DE'))
len(joined[joined.trend_DE.isnull()]),len(joined_test[joined_test.trend_DE.isnull()])
(0, 0)
joined = join_df(joined, weather, ["State","Date"])
joined_test = join_df(joined_test, weather, ["State","Date"])
len(joined[joined.Mean_TemperatureC.isnull()]),len(joined_test[joined_test.Mean_TemperatureC.isnull()])
(0, 0)
joined.describe()
Store DayOfWeek Sales Customers Open Promo SchoolHoliday Year Month Week Day Dayofweek Dayofyear Elapsed CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear trend Month_y Day_y Dayofweek_y Dayofyear_y Elapsed_y trend_DE Month_DE Day_DE Dayofweek_DE Dayofyear_DE Elapsed_DE Max_TemperatureC Mean_TemperatureC Min_TemperatureC Dew_PointC MeanDew_PointC Min_DewpointC Max_Humidity Mean_Humidity Min_Humidity Max_Sea_Level_PressurehPa Mean_Sea_Level_PressurehPa Min_Sea_Level_PressurehPa Max_VisibilityKm Mean_VisibilityKm Min_VisibilitykM Max_Wind_SpeedKm_h Mean_Wind_SpeedKm_h Max_Gust_SpeedKm_h Precipitationmm CloudCover WindDirDegrees Year_y Month_y Week_y Day_y Dayofweek_y Dayofyear_y Elapsed_y
count 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1014567.000000 693861.000000 693861.000000 1017209.000000 509178.000000 509178.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 992564.000000 992564.000000 992564.000000 1017209.000000 1017209.000000 236167.000000 1017209.000000 936890.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000 1017209.000000
mean 558.429727 3.998341 5773.818972 633.145946 0.830107 0.381515 0.178647 2013.832292 5.846762 23.615515 15.702790 2.998341 162.276385 1397179842.846062 5430.085652 7.222866 2008.690228 0.500564 23.269093 2011.752774 65.219190 5.891212 15.562106 6.000000 163.476310 1397283516.407739 66.234798 5.891212 15.562106 6.000000 163.476310 1397283516.407739 14.328300 10.073856 5.900282 8.216752 5.835002 3.263247 93.283786 73.952151 49.923426 1018.644120 1015.429182 1012.428608 23.542818 12.127392 7.284987 22.672915 11.870788 48.459272 0.790271 5.521105 177.063058 2013.832292 5.846762 23.615515 15.702790 2.998341 162.276385 1397179842.846062
std 321.908651 1.997391 3849.926175 464.411734 0.375539 0.485759 0.383056 0.777396 3.326097 14.433381 8.787638 1.997391 101.616189 23712834.841055 7715.323700 3.211832 5.992644 0.500000 14.095973 1.662870 11.339865 3.308500 8.802947 0.000000 101.235953 23757279.032632 9.618326 3.308500 8.802947 0.000000 101.235953 23757279.032632 8.464778 7.239083 6.459730 6.140112 5.993374 6.033076 7.717332 13.206449 19.626353 7.915110 8.248712 8.748368 9.222368 4.871022 4.771402 8.940418 5.897191 13.142078 2.502615 1.683795 101.748176 0.777396 3.326097 14.433381 8.787638 1.997391 101.616189 23712834.841055
min 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2013.000000 1.000000 1.000000 1.000000 0.000000 1.000000 1356998400.000000 20.000000 1.000000 1900.000000 0.000000 1.000000 2009.000000 28.000000 1.000000 1.000000 6.000000 4.000000 1357430400.000000 50.000000 1.000000 1.000000 6.000000 4.000000 1357430400.000000 -11.000000 -13.000000 -15.000000 -14.000000 -15.000000 -73.000000 44.000000 30.000000 4.000000 976.000000 974.000000 970.000000 0.000000 0.000000 0.000000 3.000000 2.000000 21.000000 0.000000 0.000000 -1.000000 2013.000000 1.000000 1.000000 1.000000 0.000000 1.000000 1356998400.000000
25% 280.000000 2.000000 3727.000000 405.000000 1.000000 0.000000 0.000000 2013.000000 3.000000 11.000000 8.000000 1.000000 77.000000 1376697600.000000 710.000000 4.000000 2006.000000 0.000000 13.000000 2011.000000 57.000000 3.000000 8.000000 6.000000 76.000000 1376784000.000000 58.000000 3.000000 8.000000 6.000000 76.000000 1376784000.000000 8.000000 4.000000 1.000000 3.000000 1.000000 -1.000000 88.000000 64.000000 34.000000 1014.000000 1011.000000 1007.000000 11.000000 10.000000 3.000000 16.000000 8.000000 39.000000 0.000000 5.000000 79.000000 2013.000000 3.000000 11.000000 8.000000 1.000000 77.000000 1376697600.000000
50% 558.000000 4.000000 5744.000000 609.000000 1.000000 0.000000 0.000000 2014.000000 6.000000 22.000000 16.000000 3.000000 153.000000 1396396800.000000 2330.000000 8.000000 2010.000000 1.000000 22.000000 2012.000000 65.000000 6.000000 15.000000 6.000000 153.000000 1396137600.000000 66.000000 6.000000 15.000000 6.000000 153.000000 1396137600.000000 15.000000 10.000000 6.000000 8.000000 6.000000 3.000000 94.000000 75.000000 49.000000 1019.000000 1016.000000 1013.000000 31.000000 11.000000 8.000000 21.000000 11.000000 47.000000 0.000000 6.000000 201.000000 2014.000000 6.000000 22.000000 16.000000 3.000000 153.000000 1396396800.000000
75% 838.000000 6.000000 7856.000000 837.000000 1.000000 1.000000 0.000000 2014.000000 8.000000 35.000000 23.000000 5.000000 241.000000 1418342400.000000 6890.000000 10.000000 2013.000000 1.000000 37.000000 2013.000000 73.000000 8.000000 23.000000 6.000000 243.000000 1418515200.000000 72.000000 8.000000 23.000000 6.000000 243.000000 1418515200.000000 21.000000 16.000000 11.000000 13.000000 11.000000 8.000000 100.000000 84.000000 65.000000 1024.000000 1021.000000 1018.000000 31.000000 14.000000 10.000000 27.000000 14.000000 55.000000 0.250000 7.000000 257.000000 2014.000000 8.000000 35.000000 23.000000 5.000000 241.000000 1418342400.000000
max 1115.000000 7.000000 41551.000000 7388.000000 1.000000 1.000000 1.000000 2015.000000 12.000000 52.000000 31.000000 6.000000 365.000000 1438300800.000000 75860.000000 12.000000 2015.000000 1.000000 50.000000 2015.000000 100.000000 12.000000 31.000000 6.000000 363.000000 1438473600.000000 100.000000 12.000000 31.000000 6.000000 363.000000 1438473600.000000 38.000000 31.000000 24.000000 25.000000 20.000000 18.000000 100.000000 100.000000 100.000000 1043.000000 1040.000000 1038.000000 31.000000 31.000000 31.000000 101.000000 53.000000 111.000000 58.930000 8.000000 360.000000 2015.000000 12.000000 52.000000 31.000000 6.000000 365.000000 1438300800.000000
for df in (joined, joined_test):
    for c in df.columns:
        if c.endswith('_y'):
            print(c)
Date_y
Month_y
Day_y
Dayofweek_y
Dayofyear_y
Is_month_end_y
Is_month_start_y
Is_quarter_end_y
Is_quarter_start_y
Is_year_end_y
Is_year_start_y
Elapsed_y
file_y
Year_y
Month_y
Week_y
Day_y
Dayofweek_y
Dayofyear_y
Is_month_end_y
Is_month_start_y
Is_quarter_end_y
Is_quarter_start_y
Is_year_end_y
Is_year_start_y
Elapsed_y
Date_y
Month_y
Day_y
Dayofweek_y
Dayofyear_y
Is_month_end_y
Is_month_start_y
Is_quarter_end_y
Is_quarter_start_y
Is_year_end_y
Is_year_start_y
Elapsed_y
file_y
Year_y
Month_y
Week_y
Day_y
Dayofweek_y
Dayofyear_y
Is_month_end_y
Is_month_start_y
Is_quarter_end_y
Is_quarter_start_y
Is_year_end_y
Is_year_start_y
Elapsed_y
for df in (joined, joined_test):
    for c in df.columns:
        if c.endswith('_y'):
            # Repeated joins can leave duplicate '_y' names; drop() removes all
            # copies at once, so skip names an earlier iteration already dropped
            if c in df.columns: df.drop(c, inplace=True, axis=1)

Next we'll fill in missing values to avoid complications with NA's. NA (not available) is how Pandas indicates missing values; many models have problems when missing values are present, so it's always important to think about how to deal with them. In these cases, we are picking an arbitrary signal value that doesn't otherwise appear in the data.

joined.shape
(1017209, 74)
for column in joined.columns:
  print(column)
Store
DayOfWeek
Date
Sales
Customers
Open
Promo
StateHoliday
SchoolHoliday
Year
Month
Week
Day
Dayofweek
Dayofyear
Is_month_end
Is_month_start
Is_quarter_end
Is_quarter_start
Is_year_end
Is_year_start
Elapsed
StoreType
Assortment
CompetitionDistance
CompetitionOpenSinceMonth
CompetitionOpenSinceYear
Promo2
Promo2SinceWeek
Promo2SinceYear
PromoInterval
State
file
week
trend
file_DE
week_DE
trend_DE
Date_DE
State_DE
Month_DE
Day_DE
Dayofweek_DE
Dayofyear_DE
Is_month_end_DE
Is_month_start_DE
Is_quarter_end_DE
Is_quarter_start_DE
Is_year_end_DE
Is_year_start_DE
Elapsed_DE
Max_TemperatureC
Mean_TemperatureC
Min_TemperatureC
Dew_PointC
MeanDew_PointC
Min_DewpointC
Max_Humidity
Mean_Humidity
Min_Humidity
Max_Sea_Level_PressurehPa
Mean_Sea_Level_PressurehPa
Min_Sea_Level_PressurehPa
Max_VisibilityKm
Mean_VisibilityKm
Min_VisibilitykM
Max_Wind_SpeedKm_h
Mean_Wind_SpeedKm_h
Max_Gust_SpeedKm_h
Precipitationmm
CloudCover
Events
WindDirDegrees
StateName
len(joined.loc[joined.StateName.isna(), "StateName"])
0
for df in (joined,joined_test):
    df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32)
    df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32)
    df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32)
    df['Promo2SinceWeek'] = df.Promo2SinceWeek.fillna(1).astype(np.int32)
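
A quick check (a sketch, not from the original run) that the sentinel fills removed every NA:

cols = ['CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth',
        'Promo2SinceYear', 'Promo2SinceWeek']
joined[cols].isnull().sum().sum()  # expect 0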

Next we'll extract the features "CompetitionOpenSince" and "CompetitionDaysOpen" by assembling a datetime from the year and month columns. (We'll see apply() used to map a function across dataframe rows in the Promo section below.)

for df in (joined,joined_test):
    df["CompetitionOpenSince"] = pd.to_datetime(dict(year=df.CompetitionOpenSinceYear, 
                                                     month=df.CompetitionOpenSinceMonth, day=15))
    df["CompetitionDaysOpen"] = df.Date.subtract(df.CompetitionOpenSince).dt.days

We'll replace some erroneous / outlying data.

for df in (joined,joined_test):
    df.loc[df.CompetitionDaysOpen<0, "CompetitionDaysOpen"] = 0
    df.loc[df.CompetitionOpenSinceYear<1990, "CompetitionDaysOpen"] = 0

We add "CompetitionMonthsOpen" field, limiting the maximum to 2 years to limit number of unique categories.

for df in (joined,joined_test):
    df["CompetitionMonthsOpen"] = df["CompetitionDaysOpen"]//30
    df.loc[df.CompetitionMonthsOpen>24, "CompetitionMonthsOpen"] = 24
joined.CompetitionMonthsOpen.unique()
array([24,  3, 19,  9,  0, 16, 17,  7, 15, 22, 11, 13,  2, 23, 12,  4, 10,  1, 14, 20,  8, 18,  6, 21,  5])

Same process for the Promo dates; note the use of apply() to map a function across dataframe rows. You may need to install the isoweek package first.

 ! pip install isoweek
Collecting isoweek
  Downloading https://files.pythonhosted.org/packages/c2/d4/fe7e2637975c476734fcbf53776e650a29680194eb0dd21dbdc020ca92de/isoweek-1.3.3-py2.py3-none-any.whl
Installing collected packages: isoweek
Successfully installed isoweek-1.3.3
from isoweek import Week
for df in (joined,joined_test):
    df["Promo2Since"] = pd.to_datetime(df.apply(lambda x: Week(
        x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1))
    df["Promo2Days"] = df.Date.subtract(df["Promo2Since"]).dt.days
joined
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday Year Month Week Day Dayofweek Dayofyear Is_month_end Is_month_start Is_quarter_end Is_quarter_start Is_year_end Is_year_start Elapsed StoreType Assortment CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval State file week trend file_DE week_DE trend_DE Date_DE State_DE Month_DE Day_DE Dayofweek_DE Dayofyear_DE Is_month_end_DE Is_month_start_DE Is_quarter_end_DE Is_quarter_start_DE Is_year_end_DE Is_year_start_DE Elapsed_DE Max_TemperatureC Mean_TemperatureC Min_TemperatureC Dew_PointC MeanDew_PointC Min_DewpointC Max_Humidity Mean_Humidity Min_Humidity Max_Sea_Level_PressurehPa Mean_Sea_Level_PressurehPa Min_Sea_Level_PressurehPa Max_VisibilityKm Mean_VisibilityKm Min_VisibilitykM Max_Wind_SpeedKm_h Mean_Wind_SpeedKm_h Max_Gust_SpeedKm_h Precipitationmm CloudCover Events WindDirDegrees StateName CompetitionOpenSince CompetitionDaysOpen CompetitionMonthsOpen Promo2Since Promo2Days
0 1 5 2015-07-31 5263 555 1 1 False 1 2015 7 31 31 4 212 True False False False False False 1438300800 c a 1270.000000 9 2008 0 1 1900 NaN HE Rossmann_DE_HE 2015-08-02 - 2015-08-08 85 Rossmann_DE 2015-08-02 - 2015-08-08 83 2015-08-02 None 8 2 6 214 False False False False False False 1438473600 23 16 8 9 6 3 98 54 18 1021 1018 1015 31.000000 15.000000 10.000000 24 11 nan 0.000000 1.000000 Fog 13 Hessen 2008-09-15 2510 24 1900-01-01 42214
1 2 5 2015-07-31 6064 625 1 1 False 1 2015 7 31 31 4 212 True False False False False False 1438300800 a a 570.000000 11 2007 1 13 2010 Jan,Apr,Jul,Oct TH Rossmann_DE_TH 2015-08-02 - 2015-08-08 80 Rossmann_DE 2015-08-02 - 2015-08-08 83 2015-08-02 None 8 2 6 214 False False False False False False 1438473600 19 13 7 9 6 3 100 62 25 1021 1019 1017 10.000000 10.000000 10.000000 14 11 nan 0.000000 4.000000 Fog 309 Thueringen 2007-11-15 2815 24 2010-03-29 1950
2 3 5 2015-07-31 8314 821 1 1 False 1 2015 7 31 31 4 212 True False False False False False 1438300800 a a 14130.000000 12 2006 1 14 2011 Jan,Apr,Jul,Oct NW Rossmann_DE_NW 2015-08-02 - 2015-08-08 86 Rossmann_DE 2015-08-02 - 2015-08-08 83 2015-08-02 None 8 2 6 214 False False False False False False 1438473600 21 13 6 10 7 4 100 61 24 1022 1019 1017 31.000000 14.000000 10.000000 14 5 nan 0.000000 2.000000 Fog 354 NordrheinWestfalen 2006-12-15 3150 24 2011-04-04 1579
3 4 5 2015-07-31 13995 1498 1 1 False 1 2015 7 31 31 4 212 True False False False False False 1438300800 c c 620.000000 9 2009 0 1 1900 NaN BE Rossmann_DE_BE 2015-08-02 - 2015-08-08 74 Rossmann_DE 2015-08-02 - 2015-08-08 83 2015-08-02 None 8 2 6 214 False False False False False False 1438473600 19 14 9 9 7 4 94 61 30 1019 1017 1014 10.000000 10.000000 10.000000 23 16 nan 0.000000 6.000000 NaN 282 Berlin 2009-09-15 2145 24 1900-01-01 42214
4 5 5 2015-07-31 4822 559 1 1 False 1 2015 7 31 31 4 212 True False False False False False 1438300800 a a 29910.000000 4 2015 0 1 1900 NaN SN Rossmann_DE_SN 2015-08-02 - 2015-08-08 82 Rossmann_DE 2015-08-02 - 2015-08-08 83 2015-08-02 None 8 2 6 214 False False False False False False 1438473600 20 15 10 8 6 5 82 55 26 1020 1018 1016 10.000000 10.000000 10.000000 14 11 nan 0.000000 4.000000 NaN 290 Sachsen 2015-04-15 107 3 1900-01-01 42214
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1017204 1111 2 2013-01-01 0 0 0 0 True 1 2013 1 1 1 1 1 False True False True False True 1356998400 a a 1900.000000 6 2014 1 31 2013 Jan,Apr,Jul,Oct NW Rossmann_DE_NW 2013-01-06 - 2013-01-12 63 Rossmann_DE 2013-01-06 - 2013-01-12 62 2013-01-06 None 1 6 6 6 False False False False False False 1357430400 8 4 2 7 5 1 94 87 65 1013 1005 1001 31.000000 12.000000 4.000000 39 26 58.000000 5.080000 6.000000 Rain 215 NordrheinWestfalen 2014-06-15 0 0 2013-07-29 -209
1017205 1112 2 2013-01-01 0 0 0 0 True 1 2013 1 1 1 1 1 False True False True False True 1356998400 c c 1880.000000 4 2006 0 1 1900 NaN NW Rossmann_DE_NW 2013-01-06 - 2013-01-12 63 Rossmann_DE 2013-01-06 - 2013-01-12 62 2013-01-06 None 1 6 6 6 False False False False False False 1357430400 8 4 2 7 5 1 94 87 65 1013 1005 1001 31.000000 12.000000 4.000000 39 26 58.000000 5.080000 6.000000 Rain 215 NordrheinWestfalen 2006-04-15 2453 24 1900-01-01 41273
1017206 1113 2 2013-01-01 0 0 0 0 True 1 2013 1 1 1 1 1 False True False True False True 1356998400 a c 9260.000000 1 1900 0 1 1900 NaN SH Rossmann_DE_SH 2013-01-06 - 2013-01-12 72 Rossmann_DE 2013-01-06 - 2013-01-12 62 2013-01-06 None 1 6 6 6 False False False False False False 1357430400 8 7 6 7 7 5 100 95 93 1002 1000 999 10.000000 8.000000 5.000000 23 16 nan 0.000000 6.000000 Rain 251 SchleswigHolstein 1900-01-15 0 0 1900-01-01 41273
1017207 1114 2 2013-01-01 0 0 0 0 True 1 2013 1 1 1 1 1 False True False True False True 1356998400 a c 870.000000 1 1900 0 1 1900 NaN HH Rossmann_DE_HH 2013-01-06 - 2013-01-12 63 Rossmann_DE 2013-01-06 - 2013-01-12 62 2013-01-06 None 1 6 6 6 False False False False False False 1357430400 8 6 4 8 6 3 100 92 70 1007 1001 997 31.000000 11.000000 6.000000 40 23 63.000000 6.100000 6.000000 Rain 234 Hamburg 1900-01-15 0 0 1900-01-01 41273
1017208 1115 2 2013-01-01 0 0 0 0 True 1 2013 1 1 1 1 1 False True False True False True 1356998400 d c 5350.000000 1 1900 1 22 2012 Mar,Jun,Sept,Dec HE Rossmann_DE_HE 2013-01-06 - 2013-01-12 73 Rossmann_DE 2013-01-06 - 2013-01-12 62 2013-01-06 None 1 6 6 6 False False False False False False 1357430400 8 6 3 6 3 1 93 80 59 1015 1008 1006 31.000000 12.000000 10.000000 23 14 39.000000 2.030000 6.000000 Rain 206 Hessen 1900-01-15 0 0 2012-05-28 218

1017209 rows × 79 columns

for df in (joined,joined_test):
    df.loc[df.Promo2Days<0, "Promo2Days"] = 0
    df.loc[df.Promo2SinceYear<1990, "Promo2Days"] = 0
    df["Promo2Weeks"] = df["Promo2Days"]//7
    df.loc[df.Promo2Weeks<0, "Promo2Weeks"] = 0
    df.loc[df.Promo2Weeks>25, "Promo2Weeks"] = 25
df.Promo2Weeks.unique()
array([ 0, 25, 24, 15, 20, 23, 14, 19, 22, 13, 18, 21, 12, 17, 11, 16, 10,  9,  8])
joined.to_pickle(PATH/'joined')
joined_test.to_pickle(PATH/'joined_test')

Durations

It is common when working with time series data to extract data that explains relationships across rows as opposed to columns, e.g.:

  • Running averages
  • Time until next event
  • Time since last event

This is often difficult to do with most table manipulation frameworks, since they are designed to work with relationships across columns. As such, we've written a function to handle this type of data.

We'll define a function get_elapsed for cumulative counting across a sorted dataframe. Given a particular field fld to monitor, this function will start tracking time since the last occurrence of that field. When the field is seen again, the counter is set to zero.

Upon initialization, this will produce NaT-based NaNs until the field is first encountered. The tracking is reset every time a new store is seen. We'll see how to use this shortly.

def get_elapsed(fld, pre):
    # Note: reads and writes the global `df`, which must already be sorted
    # by Store and Date in the direction you want to count.
    day1 = np.timedelta64(1, 'D')
    last_date = np.datetime64()   # NaT until the field is first seen
    last_store = 0
    res = []

    for s,v,d in zip(df.Store.values, df[fld].values, df.Date.values):
        if s != last_store:
            last_date = np.datetime64()   # new store: reset the clock
            last_store = s
        if v: last_date = d               # field seen: restart counting from here
        res.append(((d-last_date).astype('timedelta64[D]') / day1))
    df[pre+fld] = res
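
A toy run to make the behaviour concrete (the frame is invented for illustration; the next cell reassigns df anyway):

df = pd.DataFrame({'Store': [1, 1, 1, 1],
                   'Date': pd.to_datetime(['2015-01-01', '2015-01-02',
                                           '2015-01-03', '2015-01-04']),
                   'Promo': [0, 0, 1, 0]})
get_elapsed('Promo', 'After')
df.AfterPromo.values  # array([nan, nan, 0., 1.]) -> nan until Promo is first seen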

We'll be applying this to a subset of columns:

columns = ["Date", "Store", "Promo", "StateHoliday", "SchoolHoliday"]
df = train[columns].append(test[columns])
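
Aside: DataFrame.append was removed in pandas 2.0; on newer versions the equivalent is pd.concat (a sketch, not needed for the pandas version used here):

df = pd.concat([train[columns], test[columns]])  # same row order, indexes preserved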

Let's walk through an example.

Say we're looking at SchoolHoliday. We'll first sort by Store, then Date, and then call get_elapsed('SchoolHoliday', 'After'). This will:

  • Walk every row of the dataframe in order of store and date
  • Add to the dataframe the days since last seeing a SchoolHoliday
  • If we sort in the other direction, count the days until the next holiday instead.
??df.sort_values
fld = 'SchoolHoliday'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')
df
Date Store Promo StateHoliday SchoolHoliday AfterSchoolHoliday BeforeSchoolHoliday
0 2015-09-17 1 1 False 0 13.000000 nan
856 2015-09-16 1 1 False 0 12.000000 nan
1712 2015-09-15 1 1 False 0 11.000000 nan
2568 2015-09-14 1 1 False 0 10.000000 nan
3424 2015-09-13 1 0 False 0 9.000000 nan
... ... ... ... ... ... ... ...
1012749 2013-01-05 1115 0 False 1 0.000000 0.000000
1013864 2013-01-04 1115 0 False 1 0.000000 0.000000
1014979 2013-01-03 1115 0 False 1 0.000000 0.000000
1016094 2013-01-02 1115 0 False 1 0.000000 0.000000
1017208 2013-01-01 1115 0 True 1 0.000000 0.000000

1058297 rows × 7 columns

We'll do this for two more fields.

fld = 'StateHoliday'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')
fld = 'Promo'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')

We're going to set the active index to Date.

df = df.set_index("Date")

Then set null values from elapsed field calculations to 0.

columns = ['SchoolHoliday', 'StateHoliday', 'Promo']
for o in ['Before', 'After']:
    for p in columns:
        a = o+p
        df[a] = df[a].fillna(0).astype(int)

Next we'll demonstrate window functions in pandas to calculate rolling quantities.

Here we're sorting by date (sort_index()) and, grouped by Store (groupby()), counting the number of events of interest (sum()) defined in columns over a 7-day window (rolling()). We do the same in the opposite direction to get the forward-looking window.

bwd = df[['Store']+columns].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-158-b3a2d175a68c> in <module>()
----> 1 bwd = df[['Store']+columns].sort_index().groupby("Store").rolling(7, min_periods=1).sum()

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in sort_index(self, axis, level, ascending, inplace, kind, na_position, sort_remaining, by)
   5088 
   5089             indexer = nargsort(
-> 5090                 labels, kind=kind, ascending=ascending, na_position=na_position
   5091             )
   5092 

/usr/local/lib/python3.6/dist-packages/pandas/core/sorting.py in nargsort(items, kind, ascending, na_position)
    244 
    245     items = extract_array(items)
--> 246     mask = np.asarray(isna(items))
    247 
    248     if is_extension_array_dtype(items):

/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/missing.py in isna(obj)
    120     Name: 1, dtype: bool
    121     """
--> 122     return _isna(obj)
    123 
    124 

/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/missing.py in _isna_old(obj)
    177         return _isna_ndarraylike_old(obj)
    178     elif isinstance(obj, ABCGeneric):
--> 179         return obj._constructor(obj._data.isna(func=_isna_old))
    180     elif isinstance(obj, list):
    181         return _isna_ndarraylike_old(np.asarray(obj, dtype=object))

AttributeError: 'DatetimeArray' object has no attribute '_constructor'
fwd = df[['Store']+columns].sort_index(ascending=False
                                      ).groupby("Store").rolling(7, min_periods=1).sum()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-159-05ebca394bc4> in <module>()
----> 1 fwd = df[['Store']+columns].sort_index(ascending=False
      2                                       ).groupby("Store").rolling(7, min_periods=1).sum()

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in sort_index(self, axis, level, ascending, inplace, kind, na_position, sort_remaining, by)
   5088 
   5089             indexer = nargsort(
-> 5090                 labels, kind=kind, ascending=ascending, na_position=na_position
   5091             )
   5092 

/usr/local/lib/python3.6/dist-packages/pandas/core/sorting.py in nargsort(items, kind, ascending, na_position)
    244 
    245     items = extract_array(items)
--> 246     mask = np.asarray(isna(items))
    247 
    248     if is_extension_array_dtype(items):

/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/missing.py in isna(obj)
    120     Name: 1, dtype: bool
    121     """
--> 122     return _isna(obj)
    123 
    124 

/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/missing.py in _isna_old(obj)
    177         return _isna_ndarraylike_old(obj)
    178     elif isinstance(obj, ABCGeneric):
--> 179         return obj._constructor(obj._data.isna(func=_isna_old))
    180     elif isinstance(obj, list):
    181         return _isna_ndarraylike_old(np.asarray(obj, dtype=object))

AttributeError: 'DatetimeArray' object has no attribute '_constructor'

Next we want to drop the Store indices grouped together in the window function.

Often in pandas, there is an option to do this in place. This is time and memory efficient when working with large datasets.

bwd.drop('Store',1,inplace=True)
bwd.reset_index(inplace=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-160-5283e630275b> in <module>()
----> 1 bwd.drop('Store',1,inplace=True)
      2 bwd.reset_index(inplace=True)

NameError: name 'bwd' is not defined
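
Both rolling-sum tracebacks run through _isna_old, the code path pandas takes when use_inf_as_na is True - an option we switched on earlier inside percentage_of_null_data. Resetting it and re-running the rolling sums appears to get past the bug (a hedged workaround; the original notebook doesn't show how it recovered):

pd.options.mode.use_inf_as_na = False  # back to the default NA check, which handles DatetimeArray
bwd = df[['Store']+columns].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
fwd = df[['Store']+columns].sort_index(ascending=False
                                      ).groupby("Store").rolling(7, min_periods=1).sum()
bwd.drop('Store',1,inplace=True)
bwd.reset_index(inplace=True)

With bwd rebuilt and cleaned up, the same cleanup for fwd now works: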
fwd.drop('Store',1,inplace=True)
fwd.reset_index(inplace=True)
df.reset_index(inplace=True)

Now we'll merge these values onto the df.

df = df.merge(bwd, 'left', ['Date', 'Store'], suffixes=['', '_bw'])
df = df.merge(fwd, 'left', ['Date', 'Store'], suffixes=['', '_fw'])
df.drop(columns,1,inplace=True)
df.head()
Date Store AfterSchoolHoliday BeforeSchoolHoliday AfterStateHoliday BeforeStateHoliday AfterPromo BeforePromo SchoolHoliday_bw StateHoliday_bw Promo_bw SchoolHoliday_fw StateHoliday_fw Promo_fw
0 2015-09-17 1 13 0 105 0 0 0 0.0 0.0 4.0 0.0 0.0 1.0
1 2015-09-16 1 12 0 104 0 0 0 0.0 0.0 3.0 0.0 0.0 2.0
2 2015-09-15 1 11 0 103 0 0 0 0.0 0.0 2.0 0.0 0.0 3.0
3 2015-09-14 1 10 0 102 0 0 0 0.0 0.0 1.0 0.0 0.0 4.0
4 2015-09-13 1 9 0 101 0 9 -1 0.0 0.0 0.0 0.0 0.0 4.0

It's usually a good idea to back up large tables of extracted / wrangled features before you join them onto another one; that way you can easily go back to them if you need to make changes.

df.to_pickle(PATH/'df')
df["Date"] = pd.to_datetime(df.Date)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-155-ccbc6b250f10> in <module>()
----> 1 df["Date"] = pd.to_datetime(df.Date)

/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'Date'
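
This traceback is likely a cell-ordering artifact: when that cell first ran, Date was still the dataframe's index (from the set_index call above), so the df.Date attribute lookup failed. After reset_index the column access works (a sketch):

df['Date'] = pd.to_datetime(df['Date'])  # 'Date' is an ordinary column again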
df.columns
Index(['Date', 'Store', 'AfterSchoolHoliday', 'BeforeSchoolHoliday',
       'AfterStateHoliday', 'BeforeStateHoliday', 'AfterPromo', 'BeforePromo',
       'SchoolHoliday_bw', 'StateHoliday_bw', 'Promo_bw', 'SchoolHoliday_fw',
       'StateHoliday_fw', 'Promo_fw'],
      dtype='object')
joined = pd.read_pickle(PATH/'joined')
joined_test = pd.read_pickle(PATH/'joined_test')
joined = join_df(joined, df, ['Store', 'Date'])
joined_test = join_df(joined_test, df, ['Store', 'Date'])

The authors also removed all instances where the store had zero sales / was closed. We speculate that this may have cost them a higher standing in the competition. One reason this may be the case is that a little exploratory data analysis reveals that there are often periods where stores are closed, typically for refurbishment. Before and after these periods there are, naturally, the spikes in sales one might expect. By omitting this data from their training, the authors gave up the ability to leverage information about these periods to predict this otherwise volatile behavior.

joined = joined[joined.Sales!=0]

We'll back this up as well.

joined.reset_index(inplace=True)
joined_test.reset_index(inplace=True)
joined.to_pickle(PATH/'train_clean')
joined_test.to_pickle(PATH/'test_clean')