地理空间数据EDA数据探索性分析

EDA（Exploratory Data Analysis，探索性数据分析）是指通过了解数据集的基本情况、变量间的相互关系以及变量与预测值之间的关系，为后期的特征工程和模型建立做铺垫的分析过程。本文以智慧海洋建设竞赛为例进行演示。

1. 总体了解数据

1.1 查看样本个数和原始特征维度

data_train.shape  # (number of rows, number of columns) of the training set

data_test.shape  # (number of rows, number of columns) of the test set

data_train.columns  # list the column names

# Raise the row-count ceiling up to which info() still reports non-null counts.
pd.set_option('display.max_info_rows', 2699639)
# Equivalent attribute form: pd.options.display.max_info_rows = 2699639
data_train.info()

# Summary statistics: count of non-null values, std, and the listed percentiles (1% ... 99%).
data_train.describe([0.01, 0.025, 0.05, 0.5, 0.75, 0.9, 0.99])

1.2 查看缺失值和唯一值等

data_train.isnull().any()  # per column: does it contain at least one missing value?

# Names of the columns that contain missing values
data_train.columns[data_train.isnull().any()].tolist()

# Features with at most one distinct non-null value (they carry no information).
# DataFrame.nunique() yields the per-column distinct count, so a boolean mask over
# the columns replaces the per-column Python loop.
one_value_fea_train = data_train.columns[data_train.nunique() <= 1].tolist()
one_value_fea_test = data_test.columns[data_test.nunique() <= 1].tolist()

2. 查看数据特性和特征分布

2.1 渔船轨迹可视化

# Plot three randomly sampled vessel trajectories for each of the three classes.
def visualize_three_traj():
    """Draw a 3x3 grid of trajectories: one row per class, one column per sample.

    For every class file ("ciwang_data", "weiwang_data", "tuowang_data") three
    trajectories are fetched with ``get_random_three_traj`` and drawn as a line
    through their "x"/"y" columns, with the first point marked in red ("start")
    and the last point marked in green ("end"). The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
    fig.subplots_adjust(wspace=0.2, hspace=0.2)
    labels = ["ciwang", "weiwang", "tuowang"]
    file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
    # Wrap the list (not the enumerate object) so tqdm knows the total.
    for i, file_type in enumerate(tqdm(file_types)):
        data1, data2, data3 = get_random_three_traj(type=file_type)
        for j, datax in enumerate([data1, data2, data3]):
            # Column j holds sample j; the former `j - 1` index wrapped the
            # first sample around into the last column.
            ax = axes[i][j]
            # The whole coordinate series is plotted. The previous `.loc[-1:]`
            # label slice presumably selected every row anyway (index labels
            # assumed non-negative -- confirm against the data loader).
            x_data = datax["x"].values
            y_data = datax["y"].values
            ax.scatter(x_data[0], y_data[0], label="start", c="red", s=20, marker="o")
            ax.plot(x_data, y_data, label=labels[i])
            ax.scatter(x_data[-1], y_data[-1], label="end", c="green", s=20, marker="D")
            # alpha must lie in [0, 1]; recent Matplotlib rejects larger values.
            ax.grid(alpha=1)
            ax.legend(loc="best")
    plt.show()

visualize_three_traj()

—————-图1——————

从图中可以发现,不同类别的轨迹有一定区分性.

刺网为规则多边形.

围网为包围的形状.

拖网为点到点,转弯次数少.

从轨迹数据可以猜测,有区分度的特征可能包括:转弯角度的大小、转弯次数、起止点之间的距离和时间、经度和纬度的变化范围等.

此外,存在一些异常轨迹需要剔除.

3. 坐标序列可视化

# Pick one random trajectory and plot its x and y coordinate sequences.
def visualize_one_traj_x_y():
    """Plot the "x" and "y" columns of one random "weiwang_data" trajectory.

    The trajectory comes from ``get_random_one_traj``. Each coordinate is
    divided by 10000 and drawn against its sample position in its own subplot
    (x on top, y below). The figure is shown with ``plt.show()``; nothing is
    returned.
    """
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
    fig.subplots_adjust(wspace=0.2, hspace=0.2)

    data1 = get_random_one_traj(type="weiwang_data")
    for ax, column in zip(axes, ("x", "y")):
        # The whole series is used. The previous `.loc[-1:]` label slice
        # presumably selected every row anyway (index labels assumed
        # non-negative -- confirm against the data loader).
        values = data1[column].values / 10000
        ax.plot(np.arange(len(values)), values, label=column)
        # alpha must lie in [0, 1]; recent Matplotlib rejects larger values.
        ax.grid(alpha=1)
        ax.legend(loc="best")
    plt.show()

visualize_one_traj_x_y()

———-图2———–

由上图可知,存在某些时间段内 x、y 坐标均未发生变化的情况,说明该时段内渔船可能正停留在某处.

4.三类渔船速度和方向可视化

# For each class, pick one random vessel and plot its speed and direction sequences.
def visualize_three_traj_speed_direction():
    """Draw a 3x2 grid: one row per class, speed on the left, direction on the right.

    One trajectory per class file is fetched with ``get_random_one_traj``; its
    "速度" and "方向" columns are plotted against the sample position. The
    figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
    fig.subplots_adjust(wspace=0.1, hspace=0.1)
    file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
    speed_labels = ["ciwang_speed", "weiwang_speed", "tuowang_speed"]
    direction_labels = ["ciwang_direction", "weiwang_direction", "tuowang_direction"]
    colors = ['blue', 'red', 'brown']
    # Wrap the list (not the enumerate object) so tqdm knows the total.
    for i, file_name in enumerate(tqdm(file_types)):
        datax = get_random_one_traj(type=file_name)
        panels = (("速度", speed_labels[i]), ("方向", direction_labels[i]))
        for ax, (column, label) in zip(axes[i], panels):
            # The whole series is used. The previous `.loc[-1:]` label slice
            # presumably selected every row anyway (index labels assumed
            # non-negative -- confirm against the data loader).
            series = datax[column].values
            ax.plot(range(len(series)), series, label=label, color=colors[i])
            # alpha must lie in [0, 1]; recent Matplotlib rejects larger values.
            ax.grid(alpha=1)
            ax.legend(loc="best")
    plt.show()

visualize_three_traj_speed_direction()

————图3—————-
由上图可知,不同分类渔船的轨迹速度某些时段均存在连续的低值情况,说明可能存在某些海上停留点;不同类别渔船的方向变化都很大,可能是海上漂泊导致,作为特征对于类别的区分度低,但也存在方向变化不大的时段,强化了对停留点存在的判断.

5.三类渔船速度和方向的数据分布

# Count how often each value of one trajectory column occurs within a class file.
def get_data_cummulation(type,path,kind,columns):
    """Build a frequency table for one column across all trajectories of a class.

    Args:
        type: stem of the pickle file to read; the file opened is
            ``path + type + ".pkl"`` (callers in this document pass
            "ciwang_data", "weiwang_data" or "tuowang_data").
        path: directory prefix, concatenated as-is (so it must end with a
            path separator).
        kind: name of the column to count, "速度" or "方向".
        columns: name of the single count column in the result,
            "speed" or "direction".

    Returns:
        A DataFrame indexed by the distinct values of ``kind`` in ascending
        order, with one column named ``columns`` holding the occurrence count.
    """
    from collections import Counter

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only point this at data files you trust.
    with open(path + type + ".pkl", "rb") as file:
        data_list = pickle.load(file)

    value_counts = Counter()
    for datax in tqdm(data_list):
        value_counts.update(datax[kind].values)

    ordered = dict(sorted(value_counts.items(), key=lambda item: item[0]))
    return pd.DataFrame.from_dict(ordered, columns=[columns], orient="index")

# Build the speed and the direction frequency tables for one class file.
def get_speed_and_direction_distribution_data(type, path="./data/"):
    """Return the speed and direction frequency tables of one class.

    Args:
        type: stem of the class pickle file, forwarded to
            ``get_data_cummulation`` (e.g. "ciwang_data").
        path: directory prefix of the pickle files; defaults to the
            previously hard-coded "./data/".

    Returns:
        A tuple ``(data_speed_df, data_direction_df)``: the table counted over
        the "速度" column (count column "speed") and the one counted over the
        "方向" column (count column "direction").
    """
    data_speed_df = get_data_cummulation(type=type, path=path, kind="速度", columns="speed")
    data_direction_df = get_data_cummulation(type=type, path=path, kind="方向", columns="direction")
    return data_speed_df, data_direction_df

# Visualise the speed and direction distributions of the three classes.
# Module-level accumulators: every call to plot_speed_direction1_distribution()
# appends one frequency table per class, in plotting order.
df_speeds = []
df_directions = []

def plot_speed_direction1_distribution():
    """Overlay per-class KDE curves for speed (left) and direction (right).

    For each class file the two frequency tables are fetched with
    ``get_speed_and_direction_distribution_data``, drawn with ``sns.kdeplot``
    and appended to the module-level lists ``df_speeds`` / ``df_directions``.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, (ax_speed, ax_direction) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    fig.subplots_adjust(wspace=0.2, hspace=0.2)

    file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
    labels = ["ciwang", "weiwang", "tuowang"]
    colors = ["red", "blue", "green"]

    for file_type, label, color in zip(file_types, labels, colors):
        df_speed, df_direction = get_speed_and_direction_distribution_data(file_type)
        # NOTE(review): the "speed"/"direction" columns hold occurrence counts
        # (the observed values are the index), so these curves are densities
        # of the counts scaled by 1e-6 -- confirm this is the intended plot.
        # NOTE(review): newer seaborn releases deprecate `shade` in favour of
        # `fill`; kept as-is because the installed version is not visible here.
        # Each curve carries its own label so the legend cannot be mismatched
        # with the drawn artists.
        sns.kdeplot(df_speed["speed"].values / 1000000, color=color, shade=True,
                    label=label, ax=ax_speed)
        sns.kdeplot(df_direction["direction"].values / 1000000, color=color, shade=True,
                    label=label, ax=ax_direction)
        df_speeds.append(df_speed)
        df_directions.append(df_direction)

    ax_speed.set_xlabel("Speed")
    ax_speed.legend()
    ax_direction.set_xlabel("Direction")
    ax_direction.legend()
    plt.show()

plot_speed_direction1_distribution()

———–图4———————

由上图可知,三种类别渔船的速度分布差异较大,而刺网和围网方向分布差异不明显,拖网方向分布有差异.