{
"cells": [
{
"cell_type": "markdown",
"id": "dc808241-9a7c-4272-a318-fc2a8580ae87",
"metadata": {},
"source": [
"# Pandas"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1d0f66fd-c46d-45a1-8b5f-04aaf57f8010",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "70640ee6-1d48-4d7f-a12d-ebb456874f47",
"metadata": {
"tags": []
},
"source": [
"### agg"
]
},
{
"cell_type": "markdown",
"id": "aca60913-823b-4d3b-8d9b-6f2852bca160",
"metadata": {},
"source": [
"**aggregation的时候顺带修改新生成列的名字:**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "07787439-cee5-43da-813f-41a60ff536aa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" kind | \n",
" height | \n",
" weight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" cat | \n",
" 9.1 | \n",
" 7.9 | \n",
"
\n",
" \n",
" 1 | \n",
" dog | \n",
" 6.0 | \n",
" 7.5 | \n",
"
\n",
" \n",
" 2 | \n",
" cat | \n",
" 9.5 | \n",
" 9.9 | \n",
"
\n",
" \n",
" 3 | \n",
" dog | \n",
" 34.0 | \n",
" 198.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" kind height weight\n",
"0 cat 9.1 7.9\n",
"1 dog 6.0 7.5\n",
"2 cat 9.5 9.9\n",
"3 dog 34.0 198.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A small example frame: two numeric columns keyed by animal kind.\n",
"data = {\n",
"    'kind': ['cat', 'dog', 'cat', 'dog'],\n",
"    'height': [9.1, 6.0, 9.5, 34.0],\n",
"    'weight': [7.9, 7.5, 9.9, 198.0],\n",
"}\n",
"df = pd.DataFrame(data)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "570e0c8a-d86e-4474-863f-4df47d5388d7",
"metadata": {},
"source": [
"**aggregation的时候考虑多个列,并且改名:**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "2389cc9c-38ac-422d-a699-a863b9b64032",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" worst texture | \n",
" worst area | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 17.33 | \n",
" 2019.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 23.41 | \n",
" 1956.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 25.53 | \n",
" 1709.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 26.50 | \n",
" 567.7 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 16.67 | \n",
" 1575.0 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 564 | \n",
" 26.40 | \n",
" 2027.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 565 | \n",
" 38.25 | \n",
" 1731.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 566 | \n",
" 34.12 | \n",
" 1124.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 567 | \n",
" 39.42 | \n",
" 1821.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 568 | \n",
" 30.37 | \n",
" 268.6 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
569 rows × 3 columns
\n",
"
"
],
"text/plain": [
" worst texture worst area target\n",
"0 17.33 2019.0 0\n",
"1 23.41 1956.0 0\n",
"2 25.53 1709.0 0\n",
"3 26.50 567.7 0\n",
"4 16.67 1575.0 0\n",
".. ... ... ...\n",
"564 26.40 2027.0 0\n",
"565 38.25 1731.0 0\n",
"566 34.12 1124.0 0\n",
"567 39.42 1821.0 0\n",
"568 30.37 268.6 1\n",
"\n",
"[569 rows x 3 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `from sklearn import datasets` was removed: it was never used (the loader\n",
"# is imported directly below).\n",
"from sklearn.datasets import load_breast_cancer\n",
"\n",
"# Load the breast-cancer data into a DataFrame and keep three columns.\n",
"cancer_data = load_breast_cancer()\n",
"df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)\n",
"df['target'] = pd.Series(cancer_data.target)\n",
"df1 = df[['worst texture', 'worst area', 'target']]\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ae5b65f1-2caa-44a5-8c85-032f17cdd169",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" worst texture | \n",
" worst area | \n",
" target | \n",
" Mean worst texture | \n",
" Mean worst area | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 17.33 | \n",
" 2019.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 1 | \n",
" 23.41 | \n",
" 1956.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 2 | \n",
" 25.53 | \n",
" 1709.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 3 | \n",
" 26.50 | \n",
" 567.7 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 4 | \n",
" 16.67 | \n",
" 1575.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 564 | \n",
" 26.40 | \n",
" 2027.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 565 | \n",
" 38.25 | \n",
" 1731.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 566 | \n",
" 34.12 | \n",
" 1124.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 567 | \n",
" 39.42 | \n",
" 1821.0 | \n",
" 0 | \n",
" 29.318208 | \n",
" 1422.286321 | \n",
"
\n",
" \n",
" 568 | \n",
" 30.37 | \n",
" 268.6 | \n",
" 1 | \n",
" 23.515070 | \n",
" 558.899440 | \n",
"
\n",
" \n",
"
\n",
"
569 rows × 5 columns
\n",
"
"
],
"text/plain": [
" worst texture worst area target Mean worst texture Mean worst area\n",
"0 17.33 2019.0 0 29.318208 1422.286321\n",
"1 23.41 1956.0 0 29.318208 1422.286321\n",
"2 25.53 1709.0 0 29.318208 1422.286321\n",
"3 26.50 567.7 0 29.318208 1422.286321\n",
"4 16.67 1575.0 0 29.318208 1422.286321\n",
".. ... ... ... ... ...\n",
"564 26.40 2027.0 0 29.318208 1422.286321\n",
"565 38.25 1731.0 0 29.318208 1422.286321\n",
"566 34.12 1124.0 0 29.318208 1422.286321\n",
"567 39.42 1821.0 0 29.318208 1422.286321\n",
"568 30.37 268.6 1 23.515070 558.899440\n",
"\n",
"[569 rows x 5 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def meanofTargets(group1):\n",
"    \"\"\"Return the group with per-group means of the two feature columns appended.\n",
"\n",
"    Called via df1.groupby('target').apply(...), so `group1` is one group\n",
"    (the group key column 'target' is included).\n",
"    \"\"\"\n",
"    # Use .mean() directly instead of .agg('mean'), and build the result with\n",
"    # .assign() rather than writing columns onto the group that apply() hands\n",
"    # us -- mutating that frame triggers SettingWithCopyWarning and relies on\n",
"    # hidden in-place state. The scalar means broadcast over the group, so the\n",
"    # result is identical.\n",
"    wt = group1['worst texture'].mean()\n",
"    wa = group1['worst area'].mean()\n",
"    return group1.assign(**{'Mean worst texture': wt, 'Mean worst area': wa})\n",
"\n",
"df2 = df1.groupby('target').apply(meanofTargets)\n",
"df2"
]
},
{
"cell_type": "markdown",
"id": "e4b588d7-0f9f-457f-8d64-b325f02fcd49",
"metadata": {},
"source": [
"**类比dplyr和SQL的聚合**\n",
"\n",
"pandas实现GROUP BY环境下的SUM() / SUM()并且能对输出命名的过程是比较复杂的,举一个小例子:"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "d2351097-a58d-4c8d-acb3-848b773a1465",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" task | \n",
" experiments | \n",
" strategy | \n",
" n_layer_drop | \n",
" layer_drop | \n",
" Num examples | \n",
" inference_time | \n",
" train_time | \n",
" n_epoch | \n",
" train_time_per_epoch | \n",
" ... | \n",
" inference_throughput | \n",
" train_time_per_epoch_baseline | \n",
" inference_latency_baseline | \n",
" inference_throughput_baseline | \n",
" diff_train_time_per_epoch | \n",
" percentage_diff_train_time_per_epoch | \n",
" diff_inference_latency | \n",
" percentage_diff_inference_latency | \n",
" diff_inference_throughput | \n",
" percentage_diff_inference_throughput | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" cola | \n",
" Remove Layers | \n",
" Bottom Drop | \n",
" 2 | \n",
" 0,1 | \n",
" 1043.0 | \n",
" 9.267023 | \n",
" 326.441859 | \n",
" 3 | \n",
" 108.813953 | \n",
" ... | \n",
" 112.549625 | \n",
" 138.738982 | \n",
" 0.008906 | \n",
" 112.284841 | \n",
" -29.925029 | \n",
" -0.275011 | \n",
" -0.000021 | \n",
" -0.002358 | \n",
" 0.264784 | \n",
" 0.002353 | \n",
"
\n",
" \n",
" 1 | \n",
" cola | \n",
" Remove Layers | \n",
" Bottom Drop | \n",
" 4 | \n",
" 0,1,2,3 | \n",
" 1043.0 | \n",
" 9.261514 | \n",
" 262.275043 | \n",
" 3 | \n",
" 87.425014 | \n",
" ... | \n",
" 112.616577 | \n",
" 138.738982 | \n",
" 0.008906 | \n",
" 112.284841 | \n",
" -51.313968 | \n",
" -0.586948 | \n",
" -0.000026 | \n",
" -0.002954 | \n",
" 0.331736 | \n",
" 0.002946 | \n",
"
\n",
" \n",
" 2 | \n",
" cola | \n",
" Remove Layers | \n",
" Bottom Drop | \n",
" 6 | \n",
" 0,1,2,3,4,5 | \n",
" 1043.0 | \n",
" 9.268895 | \n",
" 198.315210 | \n",
" 3 | \n",
" 66.105070 | \n",
" ... | \n",
" 112.526899 | \n",
" 138.738982 | \n",
" 0.008906 | \n",
" 112.284841 | \n",
" -72.633912 | \n",
" -1.098765 | \n",
" -0.000019 | \n",
" -0.002156 | \n",
" 0.242058 | \n",
" 0.002151 | \n",
"
\n",
" \n",
" 3 | \n",
" cola | \n",
" Remove Layers | \n",
" Symmetric Drop | \n",
" 2 | \n",
" 5,6 | \n",
" 1043.0 | \n",
" 9.273199 | \n",
" 326.141196 | \n",
" 3 | \n",
" 108.713732 | \n",
" ... | \n",
" 112.474666 | \n",
" 138.738982 | \n",
" 0.008906 | \n",
" 112.284841 | \n",
" -30.025250 | \n",
" -0.276186 | \n",
" -0.000015 | \n",
" -0.001691 | \n",
" 0.189825 | \n",
" 0.001688 | \n",
"
\n",
" \n",
" 4 | \n",
" cola | \n",
" Remove Layers | \n",
" Symmetric Drop | \n",
" 4 | \n",
" 4,5,6,7 | \n",
" 1043.0 | \n",
" 9.271496 | \n",
" 262.707250 | \n",
" 3 | \n",
" 87.569083 | \n",
" ... | \n",
" 112.495329 | \n",
" 138.738982 | \n",
" 0.008906 | \n",
" 112.284841 | \n",
" -51.169899 | \n",
" -0.584337 | \n",
" -0.000017 | \n",
" -0.001875 | \n",
" 0.210488 | \n",
" 0.001871 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 21 columns
\n",
"
"
],
"text/plain": [
" task experiments strategy n_layer_drop layer_drop \\\n",
"0 cola Remove Layers Bottom Drop 2 0,1 \n",
"1 cola Remove Layers Bottom Drop 4 0,1,2,3 \n",
"2 cola Remove Layers Bottom Drop 6 0,1,2,3,4,5 \n",
"3 cola Remove Layers Symmetric Drop 2 5,6 \n",
"4 cola Remove Layers Symmetric Drop 4 4,5,6,7 \n",
"\n",
" Num examples inference_time train_time n_epoch train_time_per_epoch \\\n",
"0 1043.0 9.267023 326.441859 3 108.813953 \n",
"1 1043.0 9.261514 262.275043 3 87.425014 \n",
"2 1043.0 9.268895 198.315210 3 66.105070 \n",
"3 1043.0 9.273199 326.141196 3 108.713732 \n",
"4 1043.0 9.271496 262.707250 3 87.569083 \n",
"\n",
" ... inference_throughput train_time_per_epoch_baseline \\\n",
"0 ... 112.549625 138.738982 \n",
"1 ... 112.616577 138.738982 \n",
"2 ... 112.526899 138.738982 \n",
"3 ... 112.474666 138.738982 \n",
"4 ... 112.495329 138.738982 \n",
"\n",
" inference_latency_baseline inference_throughput_baseline \\\n",
"0 0.008906 112.284841 \n",
"1 0.008906 112.284841 \n",
"2 0.008906 112.284841 \n",
"3 0.008906 112.284841 \n",
"4 0.008906 112.284841 \n",
"\n",
" diff_train_time_per_epoch percentage_diff_train_time_per_epoch \\\n",
"0 -29.925029 -0.275011 \n",
"1 -51.313968 -0.586948 \n",
"2 -72.633912 -1.098765 \n",
"3 -30.025250 -0.276186 \n",
"4 -51.169899 -0.584337 \n",
"\n",
" diff_inference_latency percentage_diff_inference_latency \\\n",
"0 -0.000021 -0.002358 \n",
"1 -0.000026 -0.002954 \n",
"2 -0.000019 -0.002156 \n",
"3 -0.000015 -0.001691 \n",
"4 -0.000017 -0.001875 \n",
"\n",
" diff_inference_throughput percentage_diff_inference_throughput \n",
"0 0.264784 0.002353 \n",
"1 0.331736 0.002946 \n",
"2 0.242058 0.002151 \n",
"3 0.189825 0.001688 \n",
"4 0.210488 0.001871 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"layer_drop_time = pd.read_pickle('data/pandas_example_layer_drop_time.pickle')\n",
"\n",
"def get_speedup(df):\n",
"    \"\"\"Reduce one (task, n_layer_drop) group to a single row with a 'speedup' ratio.\n",
"\n",
"    The group keys are duplicated on every row of the group, so the first row\n",
"    is a valid representative for them.\n",
"    \"\"\"\n",
"    df_output = df[['task', 'n_layer_drop']].iloc[0]\n",
"    # Series.sum() (vectorized, NaN-aware) instead of the Python builtin sum()\n",
"    df_output['speedup'] = df['inference_latency'].sum() / df['inference_latency_baseline'].sum()\n",
"    return df_output\n",
"\n",
"layer_drop_time.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0e59ef8d-dd68-441a-81d7-ecc0fd7a2505",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" task | \n",
" n_layer_drop | \n",
" speedup | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" cola | \n",
" 2 | \n",
" 0.998095 | \n",
"
\n",
" \n",
" 1 | \n",
" cola | \n",
" 4 | \n",
" 0.997573 | \n",
"
\n",
" \n",
" 2 | \n",
" cola | \n",
" 6 | \n",
" 0.997965 | \n",
"
\n",
" \n",
" 3 | \n",
" mrpc | \n",
" 2 | \n",
" 0.998157 | \n",
"
\n",
" \n",
" 4 | \n",
" mrpc | \n",
" 4 | \n",
" 1.000850 | \n",
"
\n",
" \n",
" 5 | \n",
" mrpc | \n",
" 6 | \n",
" 1.000129 | \n",
"
\n",
" \n",
" 6 | \n",
" rte | \n",
" 2 | \n",
" 0.837115 | \n",
"
\n",
" \n",
" 7 | \n",
" rte | \n",
" 4 | \n",
" 0.721320 | \n",
"
\n",
" \n",
" 8 | \n",
" rte | \n",
" 6 | \n",
" 0.543230 | \n",
"
\n",
" \n",
" 9 | \n",
" sst-2 | \n",
" 2 | \n",
" 0.998929 | \n",
"
\n",
" \n",
" 10 | \n",
" sst-2 | \n",
" 4 | \n",
" 0.998318 | \n",
"
\n",
" \n",
" 11 | \n",
" sst-2 | \n",
" 6 | \n",
" 0.998895 | \n",
"
\n",
" \n",
" 12 | \n",
" sts-b | \n",
" 2 | \n",
" 1.347481 | \n",
"
\n",
" \n",
" 13 | \n",
" sts-b | \n",
" 4 | \n",
" 1.056215 | \n",
"
\n",
" \n",
" 14 | \n",
" sts-b | \n",
" 6 | \n",
" 0.805969 | \n",
"
\n",
" \n",
" 15 | \n",
" wnli | \n",
" 2 | \n",
" 0.819518 | \n",
"
\n",
" \n",
" 16 | \n",
" wnli | \n",
" 4 | \n",
" 0.660700 | \n",
"
\n",
" \n",
" 17 | \n",
" wnli | \n",
" 6 | \n",
" 0.539791 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" task n_layer_drop speedup\n",
"0 cola 2 0.998095\n",
"1 cola 4 0.997573\n",
"2 cola 6 0.997965\n",
"3 mrpc 2 0.998157\n",
"4 mrpc 4 1.000850\n",
"5 mrpc 6 1.000129\n",
"6 rte 2 0.837115\n",
"7 rte 4 0.721320\n",
"8 rte 6 0.543230\n",
"9 sst-2 2 0.998929\n",
"10 sst-2 4 0.998318\n",
"11 sst-2 6 0.998895\n",
"12 sts-b 2 1.347481\n",
"13 sts-b 4 1.056215\n",
"14 sts-b 6 0.805969\n",
"15 wnli 2 0.819518\n",
"16 wnli 4 0.660700\n",
"17 wnli 6 0.539791"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Apply the reducer once per (task, n_layer_drop) group, then flatten the\n",
"# resulting index so the output is a plain 0..n-1 indexed frame.\n",
"speedup_df = layer_drop_time.groupby(['task', 'n_layer_drop'], as_index=False).apply(get_speedup)\n",
"speedup_df.reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"id": "41f61118-3a10-4209-92e1-731b700c0369",
"metadata": {},
"source": [
"- 这个地方apply传入的`df`其实就是shuffle好的每个group,而且包含了group key,\n",
"\n",
"- 我们可以写一个类似reduce的函数,但第一步需要先把group的column给deduplicate了~(不做的话就写成一个复杂版的transform了)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}