import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import session_info
import pingouin as pg
# Switch to the custom matplotlib style sheet, but only on machines where
# matplotlib actually lists it as available.
_style_name = "jms_style_sheet"
if _style_name in plt.style.available:
    plt.style.use(_style_name)

# Print the versions of the loaded packages.  Pass write_req_file=True
# instead to also get a requirements.txt file of the packages used.
session_info.show(write_req_file=False)
----- matplotlib 3.5.1 numpy 1.21.5 pandas 1.4.2 pingouin 0.5.1 seaborn 0.11.2 session_info 1.0.0 -----
PIL 9.0.1 appnope 0.1.2 asttokens NA backcall 0.2.0 beta_ufunc NA binom_ufunc NA bottleneck 1.3.4 brotli NA certifi 2020.06.20 cffi 1.15.0 charset_normalizer 2.0.4 colorama 0.4.4 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 debugpy 1.5.1 decorator 5.1.1 defusedxml 0.7.1 entrypoints 0.4 executing 0.8.3 idna 3.3 ipykernel 6.9.1 ipython_genutils 0.2.0 jedi 0.18.1 jupyter_server 1.13.5 kiwisolver 1.3.1 littleutils NA matplotlib_inline NA mkl 2.4.0 mpl_toolkits NA nbinom_ufunc NA numexpr 2.8.1 outdated 0.2.1 packaging 21.3 pandas_flavor NA parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prompt_toolkit 3.0.20 ptyprocess 0.7.0 pure_eval 0.2.2 pydev_ipython NA pydevconsole NA pydevd 2.6.0 pydevd_concurrency_analyser NA pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.11.2 pyparsing 3.0.4 pytz 2021.3 requests 2.27.1 scipy 1.7.3 setuptools 61.2.0 six 1.16.0 socks 1.7.1 stack_data 0.2.0 statsmodels 0.13.2 tabulate 0.8.9 tornado 6.1 traitlets 5.1.1 urllib3 1.26.9 wcwidth 0.2.5 xarray 0.20.1 zmq 22.3.0
----- IPython 8.2.0 jupyter_client 7.2.2 jupyter_core 4.9.2 jupyterlab 3.3.2 notebook 6.4.8 ----- Python 3.10.4 (main, Mar 31 2022, 03:38:35) [Clang 12.0.0 ] macOS-10.16-x86_64-i386-64bit ----- Session information updated at 2022-06-11 13:28
# Load the prepared survey data and discard the "Unnamed: 0" column
# (presumably a leftover index column from when the CSV was written).
df = pd.read_csv("data/shield_gjames_21-09-20_prepped.csv")
df = df.drop(columns="Unnamed: 0")
display(df.head())

# Every column whose name contains "sdt" is removed while this switch is on.
drop_sdt = True
sdt_columns = df.columns[df.columns.str.contains("sdt")].tolist()
if drop_sdt:
    df = df.drop(columns=sdt_columns)
df.shape
id | sampling_weight | demographic_gender | demographic_age | demographic_4_areas | demographic_8_areas | demographic_higher_education | behaviour_indoors_nonhouseholders | behaviour_close_contact | behaviour_quarantined | ... | intention_public_transport_recoded | intention_indoor_meeting_recoded | intention_restaurant_recoded | intention_pa_recoded | intention_composite | behaviour_indoors_nonhouseholders_recoded | behaviour_unmasked_recoded | behavior_composite | behavior_composite_recoded | intention_behavior_composite | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2.060959 | 2 | 60+ | 2 | 7 | 0 | 2 | 5 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
1 | 2 | 1.784139 | 2 | 40-49 | 1 | 1 | 1 | 3 | 3 | 2 | ... | 0 | 1 | 1 | 1 | 3 | 0.785714 | 0.214286 | 0.168367 | 0.841837 | 1.920918 |
2 | 3 | 1.204000 | 1 | 60+ | 1 | 2 | 1 | 4 | 4 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0.500000 | 0.214286 | 0.107143 | 0.535714 | 0.267857 |
3 | 4 | 2.232220 | 1 | 60+ | 2 | 6 | 0 | 4 | 3 | 2 | ... | 0 | 2 | 0 | 2 | 4 | 0.500000 | 0.500000 | 0.250000 | 1.250000 | 2.625000 |
4 | 5 | 1.627940 | 2 | 18-29 | 1 | 3 | 0 | 6 | 3 | 2 | ... | 0 | 2 | 0 | 0 | 2 | 0.000000 | 0.214286 | 0.000000 | 0.000000 | 1.000000 |
5 rows × 106 columns
(2272, 87)
# Name of the outcome column analysed in the rest of the notebook.
target = "intention_behavior_composite"

# Reverse-score the target: every value x becomes 10 - x.
# Written as `10 - x` rather than `(x - 10) * -1` because the latter turns
# an exact 10.0 into a negative zero (-0.0), which then shows up in summary
# tables; all other values are numerically identical.
# NOTE: this overwrites the column in place, so re-running the cell flips
# the scores back again.
df[target] = 10 - df[target]

# Predictor columns: names starting with automaticity/norms/risk/effective,
# plus any name containing "attitude" (that alternative is deliberately left
# unanchored in the pattern).
features_list = df.filter(regex="^automaticity|attitude|^norms|^risk|^effective").columns.tolist()
# Columns of the variable-description sheet that are kept.
meta_columns = [
    "Original position",
    "Variable name",
    "Label",
    "Item english translation ",
    "Label short",
    "Type",
    "New variable name",
    "variable name helper",
    "Of primary interest as a predictor (i.e. feature)?",
    "English lo-anchor",
    "English hi-anchor",
]

# The sheet is fetched as CSV straight from Google Sheets.
sheet_id = "1BEX4W8XRGnuDk4Asa_pdKij3EIZBvhSPqHxFrDjM07k"
sheet_name = "Variable_names"
url = (
    f"https://docs.google.com/spreadsheets/d/{sheet_id}"
    f"/gviz/tq?tqx=out:csv&sheet={sheet_name}"
)
meta_df = pd.read_csv(url)[meta_columns]

# Variables of interest in df: the predictor families plus the behaviour and
# intention items.
meta_list = df.filter(regex="^automaticity|attitude|^norms|^risk|^effective|^behaviour|^intention").columns.tolist()

# Widen the column display so the long item texts are shown in full.
pd.set_option("display.max_colwidth", 350)
pd.set_option("display.expand_frame_repr", True)

# Use "Label short" instead of "Item english translation " for relabelling
# the axes.
_is_listed = meta_df["New variable name"].isin(meta_list)
meta_df.loc[_is_listed, ["Item english translation ", "Label short", "New variable name"]]
Item english translation | Label short | New variable name | |
---|---|---|---|
12 | How often in the last 7 days have you been indoors with people outside your household so that it is not related to obligations? For example, meeting friends, visiting hobbies, non-essential shopping, or other activities that are not required for your work or other duties.\n | Being indoors with people outside household | behaviour_indoors_nonhouseholders |
13 | In the last 7 days, have you been in close contact with people outside your household? Direct contact means spending more than one minute less than two meters away from another person or touching (e.g., shaking hands) outdoors or indoors. | Close contact | behaviour_close_contact |
14 | Are you currently in quarantine or isolation due to an official instruction or order? (For example, because you are waiting for a corona test, have returned from abroad or been exposed to a coronavirus) | Quarantine or isolation | behaviour_quarantined |
15 | How often in the last 7 days were you in your free time without a mask indoors with people you don’t live with? | Without a mask indoors with people outside household | behaviour_unmasked |
24 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Grocery store or other store\n | Intention to wear a mask grocery store or other store | intention_store |
25 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Bus, train or other means of public transport | Intention to wear a mask public transport | intention_public_transport |
26 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Meeting people outside your household indoors | Intention to wear a mask meeting people outside indoors | intention_indoor_meeting |
27 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Cafe, restaurant or bar indoors | Intention to wear a mask cafe, restaurant or bar | intention_restaurant |
28 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Indoor exercise | Intention to wear a mask indoor exercise | intention_pa |
29 | Taking a mask with you to a store or public transport, for example, has already become automatic for some and is done without thinking. For others, taking a mask with them is not automatic at all, but requires conscious thinking and effort. | Is taking a mask with you automatic for you? | automaticity_carry_mask |
30 | Putting on a mask, for example in a shop or on public transport, has already become automatic for some and it happens without thinking. For others, putting on a mask is not automatic at all, but requires conscious thinking and effort. | Is putting on a mask automatic for you? | automaticity_put_on_mask |
32 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | If or when I use a face mask… | inst_attitude_protects_self |
33 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | If or when I use a face mask… | inst_attitude_protects_others |
34 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | If or when I use a face mask… | inst_attitude_sense_of_community |
35 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | If or when I use a face mask… | inst_attitude_enough_oxygen |
36 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | If or when I use a face mask… | inst_attitude_no_needless_waste |
37 | Who thinks you should use a face mask and who thinks not? In the following questions, by using a face mask, we mean holding a cloth or disposable face mask, surgical mask, or respirator on the face so that it covers the nose and mouth. The questions concern leisure time. My family and friends think I should .. \n | My family and friends think I should .. | norms_family_friends |
38 | People at risk think I should .. | People at risk think I should .. | norms_risk_groups |
39 | The authorities think I should .. | The authorities think I should .. | norms_officials |
40 | In the indoors spaces I visit, people on the site think I should… | In the indoors spaces I visit, people on the site think I should… | norms_people_present_indoors |
41 | When I use a face mask, I feel or would feel ... | When I use a face mask, I feel or would feel ... | aff_attitude_comfortable |
42 | When I use a face mask, I feel or would feel ... | When I use a face mask, I feel or would feel ... | aff_attitude_calm |
43 | When I use a face mask, I feel or would feel ... | When I use a face mask, I feel or would feel ... | aff_attitude_safe |
44 | When I use a face mask, I feel or would feel ... | When I use a face mask, I feel or would feel ... | aff_attitude_responsible |
45 | When I use a face mask, I feel or would feel ... | When I use a face mask, I feel or would feel ... | aff_attitude_difficult_breathing |
61 | If two unvaccinated people from different households meet indoors, what means do you think would be effective in preventing coronavirus infection? Hand washing and use of gloves | Hand washing and use of gloves | effective_means_handwashing |
62 | Using a face mask | Using a face mask | effective_means_masks |
63 | Keeping a safety distance (2 meters) | Keeping a safety distance (2 meters) | effective_means_distance |
64 | Ventilation | Ventilation | effective_means_ventilation |
65 | How likely do you think you will get a coronavirus infection in your free time in the next month? | Perceived risk coronavirus infection | risk_likely_contagion |
66 | How likely do you think you would get a coronavirus infection in your free time in the next month if you did nothing to protect yourself from it?\r | Perceived risk coronavirus infection with no protective behaviours | risk_contagion_absent_protection |
67 | If you got a coronavirus infection, how serious a threat would you rate it to your health?\r | Perceived risk severity coronavirus infection | risk_severity |
68 | Spread of coronavirus… | Spread of coronavirus… | risk_fear_spread |
69 | The fact that I would get infected myself .. | I would get infected myself .. | risk_fear_contagion_self |
70 | That my loved one would get infected... | Loved one would get infected... | risk_fear_contagion_others |
71 | Consequences of measures taken to prevent the spread of the coronavirus... | Measures taken to prevent the spread | risk_fear_restrictions |
# Bring the column display width back down to 100 characters after the wide
# metadata table.
pd.options.display.max_colwidth = 100
Check the number of samples in the target
# Long-format copy of the target column; both plot layers draw from it.
_target_long = df[[target]].melt()
_shared_kwargs = {"data": _target_long, "x": "variable", "y": "value"}

# Violin outline first, then the individual observations on top of it.
_ = sns.violinplot(**_shared_kwargs)
_ = sns.stripplot(**_shared_kwargs, edgecolor="white", linewidth=0.5)
# Respondent counts for every gender x age-group combination.
pd.crosstab(index=df["demographic_gender"], columns=df["demographic_age"])
demographic_age | 18-29 | 30-39 | 40-49 | 50-59 | 60+ |
---|---|---|---|---|---|
demographic_gender | |||||
1 | 114 | 169 | 187 | 168 | 337 |
2 | 281 | 185 | 229 | 211 | 391 |
# Summary statistics of the target, laid out as a single row.
target_df = df[target]
target_df.describe().to_frame().transpose()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
intention_behavior_composite | 2272.0 | 8.582428 | 1.524704 | -0.0 | 8.017857 | 8.964286 | 9.5 | 10.0 |
# List the distinct values of every column whose name contains "demo".
df.filter(regex="demo").apply(pd.Series.unique)
demographic_gender [2, 1] demographic_age [60+, 40-49, 18-29, 50-59, 30-39] demographic_4_areas [2, 1, 4, 3] demographic_8_areas [7, 1, 2, 6, 3, 8, 5, 4] demographic_higher_education [0, 1] demographic_risk_group [1, 2, 3] dtype: object
# Categorical dtypes for the recoded demographic variables.  Gender is
# unordered; the other three carry an explicit category order.
gender_cat = CategoricalDtype(categories=["Man", "Woman"], ordered=False)
age_cat = CategoricalDtype(categories=["18-29", "30-39", "40-49", "50-59", "60+"], ordered=True)
higher_education_boolean_cat = CategoricalDtype(categories=["Lower", "Higher"], ordered=True)
risk_group_cat = CategoricalDtype(categories=["Low", "Medium", "High"], ordered=True)

# Add labelled categorical copies of the numerically coded demographics.
df = df.assign(
    gender=lambda d: d["demographic_gender"].replace({1: "Man", 2: "Woman"}).astype(gender_cat),
    education=lambda d: d["demographic_higher_education"].replace({0: "Lower", 1: "Higher"}).astype(higher_education_boolean_cat),
    age=lambda d: d["demographic_age"].astype(age_cat),
    risk_group=lambda d: d["demographic_risk_group"].replace({1: "Low", 2: "Medium", 3: "High"}).astype(risk_group_cat),
)

demographics_list = ["gender", "education", "age", "risk_group"]

# One (category, count) column pair per demographic, sorted by ascending
# count and placed side by side.  Variables with fewer categories are padded
# with missing values at the bottom.
demos_df = pd.concat(
    [
        df[x]
        .value_counts()
        .sort_values()
        .to_frame(name=f"{x}_amount")
        .reset_index()
        # When the counts' index is unnamed, reset_index calls the category
        # column "index"; give it the demographic's own name instead.
        .rename(columns={"index": x})
        for x in demographics_list
    ],
    axis=1,
)

# Derive the count columns from demographics_list so that every demographic
# gets a bar.  The previous hand-written list left out "education_amount".
amount_cols = [f"{x}_amount" for x in demographics_list]

# Counts are whole numbers, so show them without decimals.
pd.set_option("styler.format.precision", 0)
demos_df.style.bar(
    subset=amount_cols,
    align="mid",
    color=["#d65f5f", "#5fba7d"],
).format(na_rep="")
gender | gender_amount | education | education_amount | age | age_amount | risk_group | risk_group_amount | |
---|---|---|---|---|---|---|---|---|
0 | Man | 975 | Higher | 1053 | 30-39 | 354 | High | 131 |
1 | Woman | 1297 | Lower | 1219 | 50-59 | 379 | Low | 714 |
2 | 18-29 | 395 | Medium | 1427 | ||||
3 | 40-49 | 416 | ||||||
4 | 60+ | 728 |
# IPython shell escape: convert demographics.ipynb to an HTML file.
!jupyter nbconvert --to html demographics.ipynb
[NbConvertApp] Converting notebook demographics.ipynb to html [NbConvertApp] Writing 723238 bytes to demographics.html