Movement annotation I: Preparing training data and data for classifier

Overview

Since we have around 9000 trials in the final dataset, it is not feasible to manually annotate the movement onset and offset for each trial. Instead, we will use a simple logistic regression model to predict movement onset and offset from all the movement features we have collected in the merged dataset.

We have annotated movement onset and offset in ELAN (Wittenburg et al. 2006) for the pilot data (dyad 0). Two annotators independently annotated movement onset and offset for four tiers:

- upper body
- lower body
- arms
- head

The parent tier ‘movement’ summarizes overall movement across all tiers.

Now, we will use these ground truth annotations to create a training set for the logistic regression model.

Code to prepare the environment
# Packages
import os
import glob
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET


curfolder = os.getcwd()

# Here we store our timeseries data
processedfolder = os.path.join(curfolder + '\\..\\03_TS_processing\\TS_merged\\')
processedfiles = glob.glob(processedfolder + '\\merged*.csv')
processedfiles = [x for x in processedfiles if 'anno' not in x]

# Here we will store the training data
datasetfolder = os.path.join(curfolder + '\\TrainingData\\')

# Here we store the data ready to classify
chunked_folder = os.path.join(curfolder + '\\TS_forClassifying\\')

Preparing manual annotations

Our annotators annotated only movement, so we first need to fill the remaining gaps with ‘nomovement’ values.

Custom functions
# Function to add no-movement annotations to the ELAN file
def add_nomovement_annotations(xml_file_path, newfilepath):
    # Load the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Extract all time slots
    time_slots = {}
    for time_slot in root.find('TIME_ORDER').findall('TIME_SLOT'):
        time_slots[time_slot.attrib['TIME_SLOT_ID']] = int(time_slot.attrib['TIME_VALUE'])

    # Sort time slots by TIME_VALUE
    sorted_time_slots = sorted(time_slots.items(), key=lambda x: x[1])
    time_slot_ids = [ts[0] for ts in sorted_time_slots]
    time_values = [ts[1] for ts in sorted_time_slots]

    # Loop over all tiers
    for tier in root.findall('TIER'):
        annotations = tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION')

        if not annotations:
            # If no annotations exist, add a single 'nomovement' annotation covering the whole tier
            new_annotation = ET.Element('ANNOTATION')
            alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
            alignable_annotation.set('TIME_SLOT_REF1', time_slot_ids[0])
            alignable_annotation.set('TIME_SLOT_REF2', time_slot_ids[-1])
            annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
            annotation_value.text = 'nomovement'
            tier.append(new_annotation)
        else:
            # Sort annotations by start time
            sorted_annotations = sorted(annotations, key=lambda x: time_slots[x.attrib['TIME_SLOT_REF1']])
            
            # Handle the first annotation
            first_annotation = sorted_annotations[0]
            first_start_time = time_slots[first_annotation.attrib['TIME_SLOT_REF1']]
            if first_start_time > time_values[0]:
                new_annotation = ET.Element('ANNOTATION')
                alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
                alignable_annotation.set('TIME_SLOT_REF1', time_slot_ids[0])
                alignable_annotation.set('TIME_SLOT_REF2', first_annotation.attrib['TIME_SLOT_REF1'])
                annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
                annotation_value.text = 'nomovement'
                tier.append(new_annotation)

            # Handle gaps between annotations
            for i in range(len(sorted_annotations) - 1):
                current_annotation = sorted_annotations[i]
                next_annotation = sorted_annotations[i + 1]
                current_end_time = time_slots[current_annotation.attrib['TIME_SLOT_REF2']]
                next_start_time = time_slots[next_annotation.attrib['TIME_SLOT_REF1']]
                if current_end_time < next_start_time:
                    new_annotation = ET.Element('ANNOTATION')
                    alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
                    alignable_annotation.set('TIME_SLOT_REF1', current_annotation.attrib['TIME_SLOT_REF2'])
                    alignable_annotation.set('TIME_SLOT_REF2', next_annotation.attrib['TIME_SLOT_REF1'])
                    annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
                    annotation_value.text = 'nomovement'
                    tier.append(new_annotation)

            # Handle the last annotation
            last_annotation = sorted_annotations[-1]
            last_end_time = time_slots[last_annotation.attrib['TIME_SLOT_REF2']]
            if last_end_time < time_values[-1]:
                new_annotation = ET.Element('ANNOTATION')
                alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
                alignable_annotation.set('TIME_SLOT_REF1', last_annotation.attrib['TIME_SLOT_REF2'])
                alignable_annotation.set('TIME_SLOT_REF2', time_slot_ids[-1])
                annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
                annotation_value.text = 'nomovement'
                tier.append(new_annotation)

    # Save the modified XML file as a new file
    tree.write(newfilepath, encoding='UTF-8', xml_declaration=True)
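# Illustration (made-up times) of what add_nomovement_annotations does, assuming the
# file's TIME_ORDER spans 0-5000 ms:
#   tier before: 1000-2000 'movement', 3500-4200 'movement'
#   tier after : 0-1000 'nomovement', 1000-2000 'movement', 2000-3500 'nomovement',
#                3500-4200 'movement', 4200-5000 'nomovement'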
manualanno_folder_r1 = curfolder + '/ManualAnno/R1/'            # Annotator 1 (AC)
manualanno_folder_r3 = curfolder + '/ManualAnno/R3/'            # Annotator 2 (GR)

manualannofiles1 = glob.glob(manualanno_folder_r1 + '/*.eaf')
# get rid of those with ELAN_tiers.eaf
manualannofiles1 = [x for x in manualannofiles1 if 'ELAN_tiers' not in x]
manualannofiles3 = glob.glob(manualanno_folder_r3 + '/*.eaf')
# get rid of those with ELAN_tiers.eaf
manualannofiles3 = [x for x in manualannofiles3 if 'ELAN_tiers' not in x]


for file in manualannofiles3:
    print('working on ' + file)

    # New filename is without third part of the name
    newfile = file.split('\\')[-1]
    chunks = newfile.split('_')
    if 'corrected' in file:
        if 'c0' in file or 'c1' in file or 'c2' in file:
            newfile = '_'.join(chunks[:-4])
        else:
            newfile = '_'.join(chunks[:-3])
    else:
        if 'c0' in file or 'c1' in file or 'c2' in file:
            newfile = '_'.join(chunks[:-3])
        else:
            newfile = '_'.join(chunks[:-2]) 

    newfile = newfile.replace('trial_', '')
    
    # Save it again
    newfile = manualanno_folder_r3 + newfile + '_ELAN_tiers.eaf'

    add_nomovement_annotations(file, newfile)

Now we need to export the manual annotations from ELAN into simple text files, so that we can merge the timeseries prepared in the merging script with the information about movement in each trial.

Custom functions
# Function to parse elan file
def parse_eaf_file(eaf_file, rel_tiers):
    tree = ET.parse(eaf_file)
    root = tree.getroot()

    time_order = root.find('TIME_ORDER')
    time_slots = {time_slot.attrib['TIME_SLOT_ID']: time_slot.attrib['TIME_VALUE'] for time_slot in time_order}

    annotations = []
    relevant_tiers = {rel_tiers}

    for tier in root.findall('TIER'):
        tier_id = tier.attrib['TIER_ID']
        if tier_id in relevant_tiers:
            for annotation in tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION'):
                # Ensure required attributes are present
                if 'TIME_SLOT_REF1' in annotation.attrib and 'TIME_SLOT_REF2' in annotation.attrib:
                    ts_ref1 = annotation.attrib['TIME_SLOT_REF1']
                    ts_ref2 = annotation.attrib['TIME_SLOT_REF2']
                    # Get annotation ID if it exists, otherwise set to None
                    ann_id = annotation.attrib.get('ANNOTATION_ID', None)
                    annotation_value = annotation.find('ANNOTATION_VALUE').text.strip()
                    annotations.append({
                        'tier_id': tier_id,
                        'annotation_id': ann_id,
                        'start_time': time_slots[ts_ref1],
                        'end_time': time_slots[ts_ref2],
                        'annotation_value': annotation_value
                    })

    return annotations

# Function to load annotations into csv
def fillAnno(TSfile, ANNOfile, colname):
    TSfile[colname] = None
    for row in ANNOfile.iterrows():
        start = row[1][0]
        end = row[1][1]
        TSfile.loc[(TSfile['time'] >= start) & (TSfile['time'] <= end), colname] = row[1][2]
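
For illustration, here is a minimal, made-up example of how fillAnno writes annotation values into a timeseries. It assumes the annotation DataFrame has start time, end time and value in its first three columns, as in the tier text files we create next.

ts = pd.DataFrame({'time': [0, 100, 200, 300, 400]})                     # toy time column (ms)
anno = pd.DataFrame([[100, 300, 'movement'], [300, 400, 'nomovement']])  # start, end, value
fillAnno(ts, anno, 'arms')
print(ts)   # time 100-200 -> 'movement', time 300-400 -> 'nomovement', time 0 stays None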

Now we create a text file for each tier separately, saving only the start time, end time and value (movement or nomovement) from each annotation file.

# These are the manual annotations adapted with no-movement annotations
annofolder_manu = os.path.join(curfolder + '\\ManualAnno\\R1\\')
annofiles_manu = glob.glob(annofolder_manu + '*ELAN_tiers.eaf')

################
#### arms ######
################

arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'

with open(arms_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'arms')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
#### upper body####
###################

upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'

with open(upperbody_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'upper_body')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
#### lower body####
###################

lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'

with open(lowerbody_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'lower_body')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
##### head ########
###################

head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

with open(head_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'head_mov')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

Preparing data for the classifier

In the following code, we merge the annotations with our merged files so that we can later sample from the data based on the annotations.

We will now also filter out some superfluous information, as well as add some more features (computed as per-axis differences; see the sketch after this list for a scalar alternative), such as:

  • distance of RWrist to LWrist
  • distance of Wrist to Hip
  • distance of Head to Hip
  • distance of Head to Ankle
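The loop below stores these as per-axis differences. If a single scalar distance per keypoint pair is ever preferred, one could additionally compute the 3D Euclidean norm inside the loop (where df holds the merged trial data); a minimal sketch, with the hypothetical column name wristDistance_3d:

    ## Optional: scalar 3D distance between the wrists (sketch, not used below)
    df['wristDistance_3d'] = np.sqrt((df['RWrist_x'] - df['LWrist_x'])**2 +
                                     (df['RWrist_y'] - df['LWrist_y'])**2 +
                                     (df['RWrist_z'] - df['LWrist_z'])**2)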
# These are the annotations per tier that we just created from manual annotations
arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'
upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'
lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'
head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

for file in processedfiles:

    # TrialID
    trialid = file.split('\\')[-1].split('.')[0]
    trialid = trialid.replace('merged_', '')

    print('working on ' + trialid)

    # Load the merged file
    merged = pd.read_csv(file)
    
    # Load the annotations as df
    arms = pd.read_csv(arms_anno, sep='\t', header=None)
    ub = pd.read_csv(upperbody_anno, sep='\t', header=None)
    lb = pd.read_csv(lowerbody_anno, sep='\t', header=None)
    head = pd.read_csv(head_anno, sep='\t', header=None)

    annos = [arms, ub, lb, head]

    practice = False

    # Loop over each tier and fill values into timeseries
    for anno_df in annos:
        # Get the annotations for the trialid
        anno_trial = anno_df[anno_df[3] == trialid] 
        
        if anno_trial.empty:
            print('no annotations for ' + trialid)  # This will be the case for practice trials, which were not annotated
            practice = True
            continue
        
        else:
            if anno_df.equals(arms):
                fillAnno(merged, anno_trial, 'arms')
            elif anno_df.equals(ub):
                fillAnno(merged, anno_trial, 'upper_body')
            elif anno_df.equals(lb):
                fillAnno(merged, anno_trial, 'lower_body')
            elif anno_df.equals(head):
                fillAnno(merged, anno_trial, 'head_mov')
            else:
                print('something went wrong')

    if practice:
        continue

    df = merged.copy()

    # Now we will also add some features that might be relevant for the classifier
    ## RWrist to LWrist in all dimensions
    df['wristDistance_x'] = df['RWrist_x'] - df['LWrist_x']
    df['wristDistance_y'] = df['RWrist_y'] - df['LWrist_y']
    df['wristDistance_z'] = df['RWrist_z'] - df['LWrist_z']

    ## RWrist to RHip
    df['RwristRhipDistance_x'] = df['RWrist_x'] - df['RHip_x']
    df['RwristRhipDistance_y'] = df['RWrist_y'] - df['RHip_y']
    df['RwristRhipDistance_z'] = df['RWrist_z'] - df['RHip_z']

    ## RWrist to LHip
    df['RwristLhipDistance_x'] = df['RWrist_x'] - df['LHip_x']
    df['RwristLhipDistance_y'] = df['RWrist_y'] - df['LHip_y']
    df['RwristLhipDistance_z'] = df['RWrist_z'] - df['LHip_z']

    ## LWrist to LHip
    df['LwristLhipDistance_x'] = df['LWrist_x'] - df['LHip_x']
    df['LwristLhipDistance_y'] = df['LWrist_y'] - df['LHip_y']
    df['LwristLhipDistance_z'] = df['LWrist_z'] - df['LHip_z']

    ## LWrist to RHip
    df['LwristRhipDistance_x'] = df['LWrist_x'] - df['RHip_x']
    df['LwristRhipDistance_y'] = df['LWrist_y'] - df['RHip_y']
    df['LwristRhipDistance_z'] = df['LWrist_z'] - df['RHip_z']

    ## Head to RHip
    df['HeadRhipDistance_x'] = df['Head_x'] - df['RHip_x']
    df['HeadRhipDistance_y'] = df['Head_y'] - df['RHip_y']
    df['HeadRhipDistance_z'] = df['Head_z'] - df['RHip_z']

    ## Head to RAnkle
    df['HeadRankleDistance_x'] = df['Head_x'] - df['RAnkle_x']
    df['HeadRankleDistance_y'] = df['Head_y'] - df['RAnkle_y']
    df['HeadRankleDistance_z'] = df['Head_z'] - df['RAnkle_z']


    # Get rid of superfluous columns
    df = df.drop(columns=['left_back', 'right_forward', 'right_back', 'left_forward', 'COPXc', 'COPYc', 'FileInfo'])

    # And we also don't need vocal features
    cols = df.columns
    colstodrop = ['envelope', 'audio', 'envelope_change', 'f0', 'f1', 'f2', 'f3', 'env_', 'CoG']
    newcols = [col for col in cols if not any(x in col for x in colstodrop)]
    df = df[newcols]   
                
    # Save it
    df.to_csv(curfolder + '\\TS_annotated\\merged_anno_' + trialid + '.csv', index=False)

Now we are ready to create the training set for the logistic regression model.

Summarizing features for the training dataset

Now we will sample windows from the movement and nomovement chunks for each tier and summarize the available features in terms of mean, SD, min and max. Note that we do not restrict the features to the tier in question: each tier's training set includes all available features, since it can be useful to predict movement of one body part from information about another body part.
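
To make the summarization concrete, here is a minimal, self-contained sketch (with a made-up feature column; the name RWrist_speed is purely illustrative) of how one sampled window is collapsed into a single wide feature row:

rng = np.random.default_rng(0)
window = pd.DataFrame({'RWrist_speed': rng.normal(size=50)})   # one toy 50-row window

# describe() gives count/mean/std/min/quartiles/max; we keep only mean, std, min, max
stats = {col: window[col].describe().to_dict() for col in window.columns}
row = pd.DataFrame({f'{c}_{s}': v for c, d in stats.items() for s, v in d.items()}, index=[0])
row = row.loc[:, ~row.columns.str.contains('count|%', regex=True)]
print(row)   # columns: RWrist_speed_mean, RWrist_speed_std, RWrist_speed_min, RWrist_speed_max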

Each tier varies in the length of its movement and non-movement chunks, but we will proceed in a uniform way, setting a threshold of 50 rows, i.e., 100 ms. In such a short period of time it is in any case difficult to initiate any meaningful movement in any of the tiers of interest: head, arms, upper body, lower body.
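
The 50-row threshold corresponds to 100 ms because the merged timeseries run at roughly 2 ms per row; the sampling rate below is an assumption inferred from that correspondence, not stated explicitly in the data:

fs = 500                                # Hz; inferred from 50 rows ≈ 100 ms (an assumption)
window_ms = 100
window_rows = int(window_ms / 1000 * fs)
print(window_rows)                      # 50 rows per 100 ms window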

We will sample the windows randomly, but also make sure there are enough border cases (i.e., windows that capture the end or beginning of a movement). Our participants ‘lock’ their hands at the beginning and end of each performance; our classifier should learn that these are not ‘communicative’ movements per se.

(Note that this code takes a while to execute.)

Custom functions
# Function to sample random consecutive rows from a df
def select_random_consecutive_rows(df, change_col, threshold):
    # Group the DataFrame by the 'change' column (each group is one contiguous chunk)
    grouped = df.groupby(change_col)

    # List to hold the selected rows
    selected_rows = []

    # Loop over each group
    for _, group in grouped:
        # First and last index of the group
        idx_start = group.index[0]
        idx_last = group.index[-1]
        # Check if the group is large enough to select 'threshold' rows
        if len(group) >= threshold:
            # Randomly choose a starting index such that 'threshold' consecutive rows still fit
            # within the group (the high bound of randint is exclusive)
            start_idx = np.random.randint(idx_start, idx_last - threshold + 2)
            # Select consecutive rows from that start index
            selected = df.loc[start_idx:start_idx + threshold - 1]
            selected_rows.append(selected)

    # Concatenate all selected rows into a single DataFrame
    result_df = pd.concat(selected_rows)

    return result_df

# Transforming the dictionary into a df
def dict_to_df(data):
    # Flatten the dictionary into a format with keys like 'feature_mean', 'feature_std', etc.
    flat_data = {}
    for feature, stats in data.items():
        for stat, value in stats.items():
            flat_data[f'{feature}_{stat}'] = value

    # Convert the flat dictionary to a DataFrame with a single row
    df = pd.DataFrame(flat_data, index=[0])
    
    return df
# Set seed for reproducibility
np.random.seed(42)

# These are our timeseries to be sampled from
samplingfolder = os.path.join(curfolder + '/TS_annotated/')
samplingfiles = glob.glob(samplingfolder + '*.csv')

tiers = ['arms', 'upper_body', 'lower_body', 'head_mov']
threshold_m = 50 # threshold for movement (100 ms)
threshold_nm = 50 # threshold for no movement

for tierofinterest in tiers:
    dataset_features = pd.DataFrame()
    summaries_m = {}
    summaries_nm = {}

    counter = 1

    for file in samplingfiles:
        df = pd.read_csv(file)

        # If the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
        if 'arms' not in df.columns or 'upper_body' not in df.columns or 'lower_body' not in df.columns or 'head_mov' not in df.columns:
            print('skipping ' + file)
            continue

        # TrialID
        trialid = file.split('\\')[-1].split('.')[0]

        # Annotate unique movement/no movement chunks
        df['row_change'] = df[tierofinterest].ne(df[tierofinterest].shift()).cumsum()

        # Sample 10 random windows of the threshold length from both the movement and no-movement chunks in this tier
        tier_m = df[df[tierofinterest] == 'movement']
        tier_nm = df[df[tierofinterest] == 'nomovement']

        if not tier_m.empty:
            # 10 samples
            for i in range(10):
                tier_m_sample = select_random_consecutive_rows(tier_m, 'row_change', threshold_m)

                # Get summaries for numerical columns
                num_cols = df.select_dtypes(include=np.number).columns
                num_cols = [x for x in num_cols if x not in ['time', 'row_change']]
                for col in num_cols:
                    # Get stats and save them to dictionary
                    stats = tier_m_sample[col].describe().to_dict()
                    summaries_m[col] = stats

                # Dictionary to df row
                summary_row_m = dict_to_df(summaries_m)
                # We don't need count stats
                summary_row_m = summary_row_m.loc[:, ~summary_row_m.columns.str.contains('count|%', regex=True)]
                # Add metainfo
                summary_row_m['trialid'] = trialid
                summary_row_m['eventid'] = trialid + '_mov_' + str(counter)
                summary_row_m['anno_value'] = 'movement'

                # Add row to the main df
                dataset_features = pd.concat([dataset_features, summary_row_m])
                counter += 1
            
        counter = 1

        if not tier_nm.empty:
            for i in range(10):
                tier_nm_sample = select_random_consecutive_rows(tier_nm, 'row_change', threshold_nm)
                # Get summaries for numerical columns
                num_cols = df.select_dtypes(include=np.number).columns
                num_cols = [x for x in num_cols if x not in ['time', 'row_change']]
                for col in num_cols:
                    # Get stats and save them to dictionary
                    stats = tier_nm_sample[col].describe().to_dict()
                    summaries_nm[col] = stats

                # Dictionary to df row
                summary_row_nm = dict_to_df(summaries_nm)
                summary_row_nm = summary_row_nm.loc[:, ~summary_row_nm.columns.str.contains('count|%', regex=True)]

                # Add metainfo
                summary_row_nm['trialid'] = trialid
                summary_row_nm['eventid'] = trialid + '_nonmov_' + str(counter)
                summary_row_nm['anno_value'] = 'nomovement'

                # Add row to the main df
                dataset_features = pd.concat([dataset_features, summary_row_nm])
                counter += 1

        counter = 1

        ###################### Process border windows

        border_rows = []

        # Identify the rows where the tierofinterest changes
        change_points = df[df['row_change'].diff().abs() > 0].index

        for idx in change_points:
            # Get the window before the change
            before_start = max(0, idx - 25)  # Ensure no negative index
            before_end = idx  # Up to the change point
            before_window = df.iloc[before_start:before_end]
            # Get the annotation value
            anno_value = df.loc[idx, tierofinterest]

            # Get the window after the change
            after_start = idx
            after_end = min(len(df), idx + 25)  # Ensure no index exceeds the DataFrame length
            after_window = df.iloc[after_start:after_end]

            # Process the 'before' window in the same way as the regular chunks above
            if not before_window.empty:
                num_cols = df.select_dtypes(include=np.number).columns
                num_cols = [x for x in num_cols if x not in ['time', 'row_change']]
                summaries_before = {col: before_window[col].describe().to_dict() for col in num_cols}
                summary_row_before = dict_to_df(summaries_before)
                summary_row_before = summary_row_before.loc[:, ~summary_row_before.columns.str.contains('count|%', regex=True)]
                summary_row_before['trialid'] = trialid
                if anno_value == 'movement':
                    summary_row_before['eventid'] = f"{trialid}_border_mov_{counter}"
                else:
                    summary_row_before['eventid'] = f"{trialid}_border_nonmov_{counter}"
                summary_row_before['anno_value'] = anno_value
                dataset_features = pd.concat([dataset_features, summary_row_before])
                counter += 1

            # Process the 'after' window in the same way as the regular chunks above
            if not after_window.empty:
                summaries_after = {col: after_window[col].describe().to_dict() for col in num_cols}
                summary_row_after = dict_to_df(summaries_after)
                summary_row_after = summary_row_after.loc[:, ~summary_row_after.columns.str.contains('count|%', regex=True)]
                summary_row_after['trialid'] = trialid
                if anno_value == 'movement':
                    summary_row_after['eventid'] = f"{trialid}_border_mov_{counter}"
                else:
                    summary_row_after['eventid'] = f"{trialid}_border_nonmov_{counter}"
                summary_row_after['anno_value'] = anno_value
                dataset_features = pd.concat([dataset_features, summary_row_after])
                counter += 1

    # Drop all columns with NaN values
    dataset_features = dataset_features.dropna(axis=1)
    # Save it
    filename = '\\dataset_' + tierofinterest + '_features.csv'
    dataset_features.to_csv(datasetfolder + filename, index=False)
        
        

This is what the dataset looks like:

COPc_mean COPc_std COPc_min COPc_max pelvis_tilt_moment_mean pelvis_tilt_moment_std pelvis_tilt_moment_min pelvis_tilt_moment_max pelvis_list_moment_mean pelvis_list_moment_std ... HeadRankleDistance_y_std HeadRankleDistance_y_min HeadRankleDistance_y_max HeadRankleDistance_z_mean HeadRankleDistance_z_std HeadRankleDistance_z_min HeadRankleDistance_z_max trialid eventid anno_value
0 0.001029 0.000353 0.000497 0.001539 11.916939 0.523783 10.259013 12.440273 40.593621 1.214134 ... 0.388264 -47.812929 -46.498456 142.714523 0.264402 142.303580 143.190783 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_1 movement
1 0.002310 0.000357 0.001384 0.002592 -5.251434 6.670124 -14.455521 6.433171 -43.388846 11.359658 ... 0.951654 -46.226361 -43.045054 147.654005 0.819087 146.352625 149.044777 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_2 movement
2 0.001229 0.000654 0.000325 0.002082 4.686266 1.693475 0.823981 6.247819 24.678865 12.110897 ... 0.668766 -50.287726 -48.094340 139.471133 0.828517 138.035172 140.793319 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_3 movement
3 0.010437 0.003827 0.005062 0.016617 9.534314 4.984207 1.780355 16.748078 -22.117817 12.681626 ... 0.335743 -48.799024 -47.725756 143.580503 0.088882 143.457698 143.710256 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_4 movement
4 0.001004 0.000326 0.000497 0.001522 11.714350 0.759333 9.564795 12.252050 40.096545 1.999604 ... 0.406749 -47.938470 -46.575235 142.787307 0.272442 142.360696 143.261214 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_5 movement
5 0.001461 0.000432 0.000692 0.002762 13.462955 1.314506 12.131364 15.747833 48.942522 10.038141 ... 0.266764 -46.886244 -46.003796 142.217696 0.202835 141.921413 142.578043 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_6 movement
6 0.005301 0.000282 0.004481 0.005716 -9.424995 13.097252 -24.188497 14.602104 -123.839005 12.716710 ... 0.598752 -26.614199 -24.623861 159.472153 0.330003 158.899835 160.012364 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_7 movement
7 0.004990 0.000920 0.003090 0.007364 12.518248 2.241052 9.648455 15.750968 87.260236 6.093794 ... 0.132448 -45.982839 -45.587361 141.964505 0.093210 141.876556 142.181769 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_8 movement
8 0.003358 0.000680 0.002573 0.004292 14.285585 8.257143 -3.787133 23.009216 4.935150 15.441791 ... 0.914203 -31.859550 -28.740297 156.858607 0.546987 155.927757 157.763944 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_9 movement
9 0.000474 0.000222 0.000257 0.001104 7.607259 1.802191 3.210280 9.359289 -80.437708 8.600917 ... 0.621470 -31.547830 -29.416171 158.248435 0.507886 157.346089 159.038608 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_10 movement
10 0.000327 0.000172 0.000081 0.000551 1.142706 2.774018 -2.489563 6.568694 -59.022974 8.534245 ... 0.808444 -22.051561 -20.066378 159.953388 0.388046 159.529543 160.445080 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_1 nomovement
11 0.000859 0.000627 0.000175 0.001681 1.956649 2.745580 -1.319160 6.667728 -60.887893 1.757583 ... 0.224639 -21.057542 -20.421462 160.343627 0.493233 159.797253 160.919489 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_2 nomovement
12 0.000508 0.000227 0.000243 0.001041 1.658757 0.735249 0.017040 3.188685 -67.195356 6.031778 ... 2.212455 -25.605054 -20.390822 160.404137 0.263193 159.981512 160.727456 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_3 nomovement
13 0.000446 0.000291 0.000050 0.001093 2.139294 1.726763 -0.846748 6.088638 -62.201977 2.714513 ... 0.336568 -21.325884 -20.390822 160.209853 0.453761 159.732042 160.736602 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_4 nomovement
14 0.000276 0.000111 0.000125 0.000471 1.090214 1.638015 -1.945115 3.100118 -63.428529 5.286197 ... 1.425956 -24.065452 -20.488453 160.027971 0.174932 159.755498 160.246507 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_5 nomovement

15 rows × 1263 columns

Preparing timeseries for classifying

Now we also prepare the data for the classifier. We will use the same features as in the previous step, but now we summarize the whole timeseries from beginning to end. Each time, we take a 100 ms chunk (50 rows) and slide it forward in steps of 12 rows (~25 ms), so that the chunks overlap and we can later assess more accurately when exactly the movement onset/offset occurs.
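
As a quick sanity check on the sliding windows, this small sketch prints the row ranges of the first few chunks for a 50-row window sliding by 12 rows; the millisecond values assume the ~2 ms-per-row sampling inferred earlier and match the start/end times in the example output below:

summary_interval, slide_step = 50, 12
for i, start in enumerate(range(0, 60, slide_step), start=1):
    end = start + summary_interval - 1
    print(f"chunk {i}: rows {start}-{end}  (~{start * 2}-{end * 2} ms)")
# chunk 1: rows 0-49 (~0-98 ms), chunk 2: rows 12-61 (~24-122 ms), ...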

Custom functions
# Function to summarize every 50 rows with overlapping intervals, sliding by 12 rows
def summarize_consecutive_rows(df, trialid, num_cols, summary_interval=50, slide_step=12):
    summary_df = pd.DataFrame()
    counter = 1

    for start_idx in range(0, len(df), slide_step):
        # Select a slice of 50 rows (or fewer for the last chunk)
        selected = df.iloc[start_idx:start_idx + summary_interval]
        
        # Stop if there are no more rows to process
        if selected.empty:
            break
            
        summary_stats = {}

        # Calculate statistics for each numerical column
        for col in num_cols:
            stats = selected[col].describe().to_dict()
            summary_stats[col] = stats

        # Convert to DataFrame row format
        summary_row = dict_to_df(summary_stats)

        # Add start and end time for the chunk
        summary_row['start_time'] = selected['time'].iloc[0]
        summary_row['end_time'] = selected['time'].iloc[-1]

        # Add chunk number
        summary_row['eventid'] = f"{trialid}_chunk_{counter}"

        # Get rid of all columns that contain 'count' or '%' in the name
        summary_row = summary_row.loc[:, ~summary_row.columns.str.contains('count|%', regex=True)]

        # Append to the main DataFrame
        summary_df = pd.concat([summary_df, summary_row], ignore_index=True)

        counter += 1

        # If the selected already contains time of the last row, finish
        if selected['time'].iloc[-1] == df['time'].iloc[-1]:
            return summary_df

    return summary_df
# DataFrame to store the chunk summaries for the current trial
summary_df = pd.DataFrame()

for file in samplingfiles:

    # TrialID
    trialid = file.split('\\')[-1].split('.')[0]

    df = pd.read_csv(file)

    # If the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
    if 'arms' not in df.columns or 'upper_body' not in df.columns or 'lower_body' not in df.columns or 'head_mov' not in df.columns:
        print('skipping ' + trialid)
        continue
    else:
        print('working on ' + trialid)

    # Define numerical columns (excluding 'time' and 'change' if present)
    num_cols = [col for col in df.select_dtypes(include=np.number).columns if col != 'change' and col != 'time']

    # Summarize data in intervals of 50 rows, sliding by 12 rows
    summary_df = summarize_consecutive_rows(df, trialid, num_cols, summary_interval=50, slide_step=12)

    # Add trial ID 
    summary_df['trialid'] = trialid

    # Save it
    summary_df.to_csv(chunked_folder + trialid + '_chunked.csv', index=False)

print('All done, now we can proceed with annotation with our classifier')

This is an example of a file containing the timeseries processed into chunks:

COPc_mean COPc_std COPc_min COPc_max pelvis_tilt_moment_mean pelvis_tilt_moment_std pelvis_tilt_moment_min pelvis_tilt_moment_max pelvis_list_moment_mean pelvis_list_moment_std ... HeadRankleDistance_y_min HeadRankleDistance_y_max HeadRankleDistance_z_mean HeadRankleDistance_z_std HeadRankleDistance_z_min HeadRankleDistance_z_max start_time end_time eventid trialid
0 0.000209 0.000104 0.000023 0.000360 -4.822713 0.329021 -5.373228 -4.274271 24.058442 0.542426 ... 157.008942 157.158540 7.932183 0.016104 7.913146 7.966076 0.0 98.0 merged_anno_0_1_4_p0_chunk_1 merged_anno_0_1_4_p0
1 0.000264 0.000098 0.000118 0.000380 -4.552120 0.326263 -5.103672 -4.004715 23.612342 0.537878 ... 157.050898 157.187292 7.921748 0.009694 7.912271 7.944289 24.0 122.0 merged_anno_0_1_4_p0_chunk_2 merged_anno_0_1_4_p0
2 0.000316 0.000073 0.000136 0.000390 -4.283394 0.326321 -4.834117 -3.735160 23.169319 0.537974 ... 157.089436 157.213695 7.916291 0.004669 7.912271 7.928391 48.0 146.0 merged_anno_0_1_4_p0_chunk_3 merged_anno_0_1_4_p0
3 0.000345 0.000029 0.000288 0.000403 -4.014668 0.326350 -4.564561 -3.465605 22.726297 0.538022 ... 157.124298 157.237953 7.914763 0.002189 7.912271 7.919865 72.0 170.0 merged_anno_0_1_4_p0_chunk_4 merged_anno_0_1_4_p0
4 0.000396 0.000097 0.000288 0.000649 -3.745942 0.326350 -4.295006 -3.196049 22.283274 0.538022 ... 157.156162 157.260390 7.916547 0.004392 7.912271 7.926353 96.0 194.0 merged_anno_0_1_4_p0_chunk_5 merged_anno_0_1_4_p0
5 0.000463 0.000139 0.000288 0.000669 -3.477216 0.326321 -4.025450 -2.926494 21.840251 0.537974 ... 157.185195 157.281624 7.920605 0.006376 7.912421 7.931427 120.0 218.0 merged_anno_0_1_4_p0_chunk_6 merged_anno_0_1_4_p0
6 0.000517 0.000134 0.000288 0.000669 -3.208490 0.326263 -3.755895 -2.656938 21.397228 0.537878 ... 157.211721 157.303893 7.923801 0.005444 7.914729 7.931427 144.0 242.0 merged_anno_0_1_4_p0_chunk_7 merged_anno_0_1_4_p0
7 0.000596 0.000070 0.000380 0.000669 -2.939764 0.326176 -3.486340 -2.406044 20.954205 0.537735 ... 157.236187 157.327390 7.920155 0.012288 7.885567 7.931427 168.0 266.0 merged_anno_0_1_4_p0_chunk_8 merged_anno_0_1_4_p0
8 0.000629 0.000029 0.000576 0.000669 -2.672253 0.324061 -3.216784 -2.142126 20.510529 0.538635 ... 157.258711 157.353423 7.906374 0.026949 7.853243 7.931427 192.0 290.0 merged_anno_0_1_4_p0_chunk_9 merged_anno_0_1_4_p0
9 0.000612 0.000038 0.000488 0.000661 -2.419432 0.304439 -2.965890 -1.963964 20.058402 0.553000 ... 157.278528 157.370746 7.888315 0.031005 7.848620 7.931427 216.0 314.0 merged_anno_0_1_4_p0_chunk_10 merged_anno_0_1_4_p0
10 0.000570 0.000096 0.000373 0.000661 -2.185570 0.269905 -2.677673 -1.763470 19.591719 0.575306 ... 157.302064 157.383235 7.878944 0.023500 7.848620 7.921558 240.0 338.0 merged_anno_0_1_4_p0_chunk_11 merged_anno_0_1_4_p0
11 0.000520 0.000107 0.000373 0.000661 -1.971889 0.240694 -2.424706 -1.589662 19.105494 0.609702 ... 157.325592 157.388509 7.893162 0.046331 7.848620 8.008189 264.0 362.0 merged_anno_0_1_4_p0_chunk_12 merged_anno_0_1_4_p0
12 0.000524 0.000115 0.000373 0.000714 -1.779204 0.223516 -2.156786 -1.407064 18.598208 0.628551 ... 157.351755 157.390481 7.941962 0.088600 7.848620 8.132930 288.0 386.0 merged_anno_0_1_4_p0_chunk_13 merged_anno_0_1_4_p0
13 0.000518 0.000119 0.000373 0.000714 -1.599161 0.217004 -1.977071 -1.252625 18.074484 0.648964 ... 157.369463 157.391156 8.024363 0.122492 7.858972 8.245924 312.0 410.0 merged_anno_0_1_4_p0_chunk_14 merged_anno_0_1_4_p0
14 0.000479 0.000162 0.000197 0.000714 -1.426408 0.203184 -1.777833 -1.089737 17.541551 0.649080 ... 157.382657 157.408323 8.126996 0.137761 7.915174 8.367000 336.0 434.0 merged_anno_0_1_4_p0_chunk_15 merged_anno_0_1_4_p0

15 rows × 2080 columns


Now we are ready to train the logistic regression classifier.
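
To preview where this leads, here is a minimal sketch (assuming scikit-learn is available) of how one of the training sets created above could feed the classifier. The split, scaling and hyperparameters are placeholders; the actual training and evaluation are covered in the next part.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load one of the per-tier training sets created above
train = pd.read_csv(datasetfolder + 'dataset_arms_features.csv')
X = train.drop(columns=['trialid', 'eventid', 'anno_value'])
y = (train['anno_value'] == 'movement').astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))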

References

Wittenburg, Peter, Hennie Brugman, Albert Russel, Alex Klassmann, and Han Sloetjes. 2006. “ELAN: A Professional Framework for Multimodality Research.” In Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC 2006).