Movement annotation I: Preparing training data and data for classifier

Overview

Since we have around 9000 trials in the final dataset, it is not feasible to manually annotate the movement onset and offset for each trial. Instead, we will use a simple logistic regression model to predict movement onset and offset from all the movement features we have collected in the merged dataset.

We have annotated movement onset and offset in ELAN (Wittenburg et al. 2006) for the pilot data (dyad 0). Two annotators independently annotated movement onset and offset for four tiers:

- upper body
- lower body
- arms
- head

The parent tier ‘movement’ summarizes overall movement across all tiers.

Now, we will use these ground truth annotations to create a training set for the logistic regression model.

Code to prepare the environment
# Packages
import os
import glob
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET


curfolder = os.getcwd()

# Here we store our timeseries data
processedfolder = os.path.join(curfolder + '\\..\\03_TS_processing\\TS_merged\\')
processedfiles = glob.glob(processedfolder + '\\merged*.csv')
processedfiles = [x for x in processedfiles if 'anno' not in x]

# Here we will store the training data
datasetfolder = os.path.join(curfolder + '\\TrainingData\\')

# Here we store the data ready to classify
chunked_folder = os.path.join(curfolder + '\\TS_forClassifying\\')

Preparing manual annotations

Our annotators annotated only movement, so we first need to fill the remaining gaps with ‘nomovement’ values.

Custom functions
# Function to add no-movement annotations to the ELAN file
def add_nomovement_annotations(xml_file_path, newfilepath):
    # Load the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Extract all time slots
    time_slots = {}
    for time_slot in root.find('TIME_ORDER').findall('TIME_SLOT'):
        time_slots[time_slot.attrib['TIME_SLOT_ID']] = int(time_slot.attrib['TIME_VALUE'])

    # Sort time slots by TIME_VALUE
    sorted_time_slots = sorted(time_slots.items(), key=lambda x: x[1])
    time_slot_ids = [ts[0] for ts in sorted_time_slots]
    time_values = [ts[1] for ts in sorted_time_slots]

    # Loop over all tiers
    for tier in root.findall('TIER'):
        annotations = tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION')

        if not annotations:
            # If no annotations exist, add a single 'nomovement' annotation covering the whole tier
            new_annotation = ET.Element('ANNOTATION')
            alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
            alignable_annotation.set('TIME_SLOT_REF1', time_slot_ids[0])
            alignable_annotation.set('TIME_SLOT_REF2', time_slot_ids[-1])
            annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
            annotation_value.text = 'nomovement'
            tier.append(new_annotation)
        else:
            # Sort annotations by start time
            sorted_annotations = sorted(annotations, key=lambda x: time_slots[x.attrib['TIME_SLOT_REF1']])
            
            # Handle the first annotation
            first_annotation = sorted_annotations[0]
            first_start_time = time_slots[first_annotation.attrib['TIME_SLOT_REF1']]
            if first_start_time > time_values[0]:
                new_annotation = ET.Element('ANNOTATION')
                alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
                alignable_annotation.set('TIME_SLOT_REF1', time_slot_ids[0])
                alignable_annotation.set('TIME_SLOT_REF2', first_annotation.attrib['TIME_SLOT_REF1'])
                annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
                annotation_value.text = 'nomovement'
                tier.append(new_annotation)

            # Handle gaps between annotations
            for i in range(len(sorted_annotations) - 1):
                current_annotation = sorted_annotations[i]
                next_annotation = sorted_annotations[i + 1]
                current_end_time = time_slots[current_annotation.attrib['TIME_SLOT_REF2']]
                next_start_time = time_slots[next_annotation.attrib['TIME_SLOT_REF1']]
                if current_end_time < next_start_time:
                    new_annotation = ET.Element('ANNOTATION')
                    alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
                    alignable_annotation.set('TIME_SLOT_REF1', current_annotation.attrib['TIME_SLOT_REF2'])
                    alignable_annotation.set('TIME_SLOT_REF2', next_annotation.attrib['TIME_SLOT_REF1'])
                    annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
                    annotation_value.text = 'nomovement'
                    tier.append(new_annotation)

            # Handle the last annotation
            last_annotation = sorted_annotations[-1]
            last_end_time = time_slots[last_annotation.attrib['TIME_SLOT_REF2']]
            if last_end_time < time_values[-1]:
                new_annotation = ET.Element('ANNOTATION')
                alignable_annotation = ET.SubElement(new_annotation, 'ALIGNABLE_ANNOTATION')
                alignable_annotation.set('TIME_SLOT_REF1', last_annotation.attrib['TIME_SLOT_REF2'])
                alignable_annotation.set('TIME_SLOT_REF2', time_slot_ids[-1])
                annotation_value = ET.SubElement(alignable_annotation, 'ANNOTATION_VALUE')
                annotation_value.text = 'nomovement'
                tier.append(new_annotation)

    # Save the modified XML file as a new file
    tree.write(newfilepath, encoding='UTF-8', xml_declaration=True)
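# Illustration (made-up times) of what add_nomovement_annotations does, assuming the
# file's TIME_ORDER spans 0-5000 ms:
#   tier before: 1000-2000 'movement', 3500-4200 'movement'
#   tier after : 0-1000 'nomovement', 1000-2000 'movement', 2000-3500 'nomovement',
#                3500-4200 'movement', 4200-5000 'nomovement'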
manualanno_folder_r1 = curfolder + '/ManualAnno/R1/'            # Annotator 1 (AC)
manualanno_folder_r3 = curfolder + '/ManualAnno/R3/'            # Annotator 2 (GR)

manualannofiles1 = glob.glob(manualanno_folder_r1 + '/*.eaf')
# get rid of those with ELAN_tiers.eaf
manualannofiles1 = [x for x in manualannofiles1 if 'ELAN_tiers' not in x]
manualannofiles3 = glob.glob(manualanno_folder_r3 + '/*.eaf')
# get rid of those with ELAN_tiers.eaf
manualannofiles3 = [x for x in manualannofiles3 if 'ELAN_tiers' not in x]


for file in manualannofiles3:
    print('working on ' + file)

    # New filename is without third part of the name
    newfile = file.split('\\')[-1]
    chunks = newfile.split('_')
    if 'corrected' in file:
        if 'c0' in file or 'c1' in file or 'c2' in file:
            newfile = '_'.join(chunks[:-4])
        else:
            newfile = '_'.join(chunks[:-3])
    else:
        if 'c0' in file or 'c1' in file or 'c2' in file:
            newfile = '_'.join(chunks[:-3])
        else:
            newfile = '_'.join(chunks[:-2]) 

    newfile = newfile.replace('trial_', '')
    
    # Save it again
    newfile = manualanno_folder_r3 + newfile + '_ELAN_tiers.eaf'

    add_nomovement_annotations(file, newfile)

Now we need to export the manual annotations from ELAN into simple text files, so that we can merge the timeseries prepared in the merging script with the information about movement in each trial.

Custom functions
# Function to parse elan file
def parse_eaf_file(eaf_file, rel_tiers):
    tree = ET.parse(eaf_file)
    root = tree.getroot()

    time_order = root.find('TIME_ORDER')
    time_slots = {time_slot.attrib['TIME_SLOT_ID']: time_slot.attrib['TIME_VALUE'] for time_slot in time_order}

    annotations = []
    relevant_tiers = {rel_tiers}

    for tier in root.findall('TIER'):
        tier_id = tier.attrib['TIER_ID']
        if tier_id in relevant_tiers:
            for annotation in tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION'):
                # Ensure required attributes are present
                if 'TIME_SLOT_REF1' in annotation.attrib and 'TIME_SLOT_REF2' in annotation.attrib:
                    ts_ref1 = annotation.attrib['TIME_SLOT_REF1']
                    ts_ref2 = annotation.attrib['TIME_SLOT_REF2']
                    # Get annotation ID if it exists, otherwise set to None
                    ann_id = annotation.attrib.get('ANNOTATION_ID', None)
                    annotation_value = annotation.find('ANNOTATION_VALUE').text.strip()
                    annotations.append({
                        'tier_id': tier_id,
                        'annotation_id': ann_id,
                        'start_time': time_slots[ts_ref1],
                        'end_time': time_slots[ts_ref2],
                        'annotation_value': annotation_value
                    })

    return annotations

# Function to load annotations into csv
def fillAnno(TSfile, ANNOfile, colname):
    TSfile[colname] = None
    for row in ANNOfile.iterrows():
        start = row[1][0]
        end = row[1][1]
        TSfile.loc[(TSfile['time'] >= start) & (TSfile['time'] <= end), colname] = row[1][2]
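
For illustration, here is a minimal, made-up example of how fillAnno writes annotation values into a timeseries. It assumes the annotation DataFrame has start time, end time and value in its first three columns, as in the tier text files we create next.

ts = pd.DataFrame({'time': [0, 100, 200, 300, 400]})                     # toy time column (ms)
anno = pd.DataFrame([[100, 300, 'movement'], [300, 400, 'nomovement']])  # start, end, value
fillAnno(ts, anno, 'arms')
print(ts)   # time 100-200 -> 'movement', time 300-400 -> 'nomovement', time 0 stays None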

Now we create a text file for each tier separately, saving only the start time, end time and value (movement or nomovement) from each annotation file.

# These are the manual annotations adapted with no-movement annotations
annofolder_manu = os.path.join(curfolder + '\\ManualAnno\\R1\\')
annofiles_manu = glob.glob(annofolder_manu + '*ELAN_tiers.eaf')

################
#### arms ######
################

arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'

with open(arms_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'arms')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
#### upper body####
###################

upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'

with open(upperbody_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'upper_body')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
#### lower body####
###################

lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'

with open(lowerbody_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'lower_body')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

###################
##### head ########
###################

head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

with open(head_anno, 'w') as f:
    for file in annofiles_manu:
        print('working on ' + file)
        # get the filename as the last element
        filename = file.split('\\')[-1]
        # replace _ELAN_tiers.eaf with ''
        filename = filename.replace('_ELAN_tiers.eaf', '')
        # parse the file
        annotations = parse_eaf_file(file, 'head_mov')
        # write the annotations
        for annotation in annotations:
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

Preparing data for the classifier

In the following code, we merge the annotations with our merged files so that we can later sample from the data based on the annotations.

We will now also filter out some superfluous information, as well as add some more features (computed as per-axis differences; see the sketch after this list for a scalar alternative), such as:

  • distance of RWrist to LWrist
  • distance of Wrist to Hip
  • distance of Head to Hip
  • distance of Head to Ankle
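The loop below stores these as per-axis differences. If a single scalar distance per keypoint pair is ever preferred, one could additionally compute the 3D Euclidean norm inside the loop (where df holds the merged trial data); a minimal sketch, with the hypothetical column name wristDistance_3d:

    ## Optional: scalar 3D distance between the wrists (sketch, not used below)
    df['wristDistance_3d'] = np.sqrt((df['RWrist_x'] - df['LWrist_x'])**2 +
                                     (df['RWrist_y'] - df['LWrist_y'])**2 +
                                     (df['RWrist_z'] - df['LWrist_z'])**2)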
# These are the annotations per tier that we just created from manual annotations
arms_anno = curfolder + '/annotations_groundTruth/arms_annotations.txt'
upperbody_anno = curfolder + '/annotations_groundTruth/upperbody_annotations.txt'
lowerbody_anno = curfolder + '/annotations_groundTruth/lowerbody_annotations.txt'
head_anno = curfolder + '/annotations_groundTruth/head_annotations.txt'

for file in processedfiles:

    # TrialID
    trialid = file.split('\\')[-1].split('.')[0]
    trialid = trialid.replace('merged_', '')

    print('working on ' + trialid)

    # Load the merged file
    merged = pd.read_csv(file)
    
    # Load the annotations as df
    arms = pd.read_csv(arms_anno, sep='\t', header=None)
    ub = pd.read_csv(upperbody_anno, sep='\t', header=None)
    lb = pd.read_csv(lowerbody_anno, sep='\t', header=None)
    head = pd.read_csv(head_anno, sep='\t', header=None)

    annos = [arms, ub, lb, head]

    practice = False

    # Loop over each tier and fill values into timeseries
    for anno_df in annos:
        # Get the annotations for the trialid
        anno_trial = anno_df[anno_df[3] == trialid] 
        
        if anno_trial.empty:
            print('no annotations for ' + trialid)  # This will be the case for practice trials, which were not annotated
            practice = True
            continue
        
        else:
            if anno_df.equals(arms):
                fillAnno(merged, anno_trial, 'arms')
            elif anno_df.equals(ub):
                fillAnno(merged, anno_trial, 'upper_body')
            elif anno_df.equals(lb):
                fillAnno(merged, anno_trial, 'lower_body')
            elif anno_df.equals(head):
                fillAnno(merged, anno_trial, 'head_mov')
            else:
                print('something went wrong')

    if practice:
        continue

    df = merged.copy()

    # Now we will also add some features that might be relevant for the classifier
    ## RWrist to LWrist in all dimensions
    df['wristDistance_x'] = df['RWrist_x'] - df['LWrist_x']
    df['wristDistance_y'] = df['RWrist_y'] - df['LWrist_y']
    df['wristDistance_z'] = df['RWrist_z'] - df['LWrist_z']

    ## RWrist to RHip
    df['RwristRhipDistance_x'] = df['RWrist_x'] - df['RHip_x']
    df['RwristRhipDistance_y'] = df['RWrist_y'] - df['RHip_y']
    df['RwristRhipDistance_z'] = df['RWrist_z'] - df['RHip_z']

    ## RWrist to LHip
    df['RwristLhipDistance_x'] = df['RWrist_x'] - df['LHip_x']
    df['RwristLhipDistance_y'] = df['RWrist_y'] - df['LHip_y']
    df['RwristLhipDistance_z'] = df['RWrist_z'] - df['LHip_z']

    ## LWrist to LHip
    df['LwristLhipDistance_x'] = df['LWrist_x'] - df['LHip_x']
    df['LwristLhipDistance_y'] = df['LWrist_y'] - df['LHip_y']
    df['LwristLhipDistance_z'] = df['LWrist_z'] - df['LHip_z']

    ## LWrist to RHip
    df['LwristRhipDistance_x'] = df['LWrist_x'] - df['RHip_x']
    df['LwristRhipDistance_y'] = df['LWrist_y'] - df['RHip_y']
    df['LwristRhipDistance_z'] = df['LWrist_z'] - df['RHip_z']

    ## Head to RHip
    df['HeadRhipDistance_x'] = df['Head_x'] - df['RHip_x']
    df['HeadRhipDistance_y'] = df['Head_y'] - df['RHip_y']
    df['HeadRhipDistance_z'] = df['Head_z'] - df['RHip_z']

    ## Head to RAnkle
    df['HeadRankleDistance_x'] = df['Head_x'] - df['RAnkle_x']
    df['HeadRankleDistance_y'] = df['Head_y'] - df['RAnkle_y']
    df['HeadRankleDistance_z'] = df['Head_z'] - df['RAnkle_z']


    # Get rid of superfluous columns
    df = df.drop(columns=['left_back', 'right_forward', 'right_back', 'left_forward', 'COPXc', 'COPYc', 'FileInfo'])

    # And we also don't need vocal features
    cols = df.columns
    colstodrop = ['envelope', 'audio', 'envelope_change', 'f0', 'f1', 'f2', 'f3', 'env_', 'CoG']
    newcols = [col for col in cols if not any(x in col for x in colstodrop)]
    df = df[newcols]   
                
    # Save it
    df.to_csv(curfolder + '\\TS_annotated\\merged_anno_' + trialid + '.csv', index=False)

Now we are ready to create the training set for the logistic regression model.

Summarizing features for the training dataset

Now we will sample windows from the movement and nomovement chunks for each tier and summarize the available features in terms of mean, SD, min and max. Note that we do not restrict the features to the tier in question: each tier's training set includes all available features, since it can be useful to predict movement of one body part from information about another body part.
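
To make the summarization concrete, here is a minimal, self-contained sketch (with a made-up feature column; the name RWrist_speed is purely illustrative) of how one sampled window is collapsed into a single wide feature row:

rng = np.random.default_rng(0)
window = pd.DataFrame({'RWrist_speed': rng.normal(size=50)})   # one toy 50-row window

# describe() gives count/mean/std/min/quartiles/max; we keep only mean, std, min, max
stats = {col: window[col].describe().to_dict() for col in window.columns}
row = pd.DataFrame({f'{c}_{s}': v for c, d in stats.items() for s, v in d.items()}, index=[0])
row = row.loc[:, ~row.columns.str.contains('count|%', regex=True)]
print(row)   # columns: RWrist_speed_mean, RWrist_speed_std, RWrist_speed_min, RWrist_speed_max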

Each tier varies in the length of its movement and non-movement chunks, but we will proceed in a uniform way, setting a threshold of 50 rows, i.e., 100 ms. In such a short period of time it is in any case difficult to initiate any meaningful movement in any of the tiers of interest: head, arms, upper body, lower body.
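
The 50-row threshold corresponds to 100 ms because the merged timeseries run at roughly 2 ms per row; the sampling rate below is an assumption inferred from that correspondence, not stated explicitly in the data:

fs = 500                                # Hz; inferred from 50 rows ≈ 100 ms (an assumption)
window_ms = 100
window_rows = int(window_ms / 1000 * fs)
print(window_rows)                      # 50 rows per 100 ms window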

We will sample the windows randomly, but also make sure there are enough border cases (i.e., windows that capture the end or beginning of a movement). Our participants ‘lock’ their hands at the beginning and end of each performance; our classifier should learn that these are not ‘communicative’ movements per se.

(Note that this code takes a while to execute.)

Custom functions
# Function to sample random consecutive rows from a df
def select_random_consecutive_rows(df, change_col, threshold):
    # Group the DataFrame by the 'change' column (each group is one contiguous chunk)
    grouped = df.groupby(change_col)

    # List to hold the selected rows
    selected_rows = []

    # Loop over each group
    for _, group in grouped:
        # First and last index of the group
        idx_start = group.index[0]
        idx_last = group.index[-1]
        # Check if the group is large enough to select 'threshold' rows
        if len(group) >= threshold:
            # Randomly choose a starting index such that 'threshold' consecutive rows still fit
            # within the group (the high bound of randint is exclusive)
            start_idx = np.random.randint(idx_start, idx_last - threshold + 2)
            # Select consecutive rows from that start index
            selected = df.loc[start_idx:start_idx + threshold - 1]
            selected_rows.append(selected)

    # Concatenate all selected rows into a single DataFrame
    result_df = pd.concat(selected_rows)

    return result_df

# Transforming the dictionary into a df
def dict_to_df(data):
    # Flatten the dictionary into a format with keys like 'feature_mean', 'feature_std', etc.
    flat_data = {}
    for feature, stats in data.items():
        for stat, value in stats.items():
            flat_data[f'{feature}_{stat}'] = value

    # Convert the flat dictionary to a DataFrame with a single row
    df = pd.DataFrame(flat_data, index=[0])
    
    return df
# Set seed for reproducibility
np.random.seed(42)

# These are our timeseries to be sampled from
samplingfolder = os.path.join(curfolder + '/TS_annotated/')
samplingfiles = glob.glob(samplingfolder + '*.csv')

tiers = ['arms', 'upper_body', 'lower_body', 'head_mov']
threshold_m = 50 # threshold for movement (100 ms)
threshold_nm = 50 # threshold for no movement

for tierofinterest in tiers:
    dataset_features = pd.DataFrame()
    summaries_m = {}
    summaries_nm = {}

    counter = 1

    for file in samplingfiles:
        df = pd.read_csv(file)

        # If the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
        if 'arms' not in df.columns or 'upper_body' not in df.columns or 'lower_body' not in df.columns or 'head_mov' not in df.columns:
            print('skipping ' + file)
            continue

        # TrialID
        trialid = file.split('\\')[-1].split('.')[0]

        # Annotate unique movement/no movement chunks
        df['row_change'] = df[tierofinterest].ne(df[tierofinterest].shift()).cumsum()

        # Sample 10 random windows of the threshold length from both the movement and no-movement chunks in this tier
        tier_m = df[df[tierofinterest] == 'movement']
        tier_nm = df[df[tierofinterest] == 'nomovement']

        if not tier_m.empty:
            # 10 samples
            for i in range(10):
                tier_m_sample = select_random_consecutive_rows(tier_m, 'row_change', threshold_m)

                # Get summaries for numerical columns
                num_cols = df.select_dtypes(include=np.number).columns
                num_cols = [x for x in num_cols if x not in ['time', 'row_change']]
                for col in num_cols:
                    # Get stats and save them to dictionary
                    stats = tier_m_sample[col].describe().to_dict()
                    summaries_m[col] = stats

                # Dictionary to df row
                summary_row_m = dict_to_df(summaries_m)
                # We don't need count stats
                summary_row_m = summary_row_m.loc[:, ~summary_row_m.columns.str.contains('count|%', regex=True)]
                # Add metainfo
                summary_row_m['trialid'] = trialid
                summary_row_m['eventid'] = trialid + '_mov_' + str(counter)
                summary_row_m['anno_value'] = 'movement'

                # Add row to the main df
                dataset_features = pd.concat([dataset_features, summary_row_m])
                counter += 1
            
        counter = 1

        if not tier_nm.empty:
            for i in range(10):
                tier_nm_sample = select_random_consecutive_rows(tier_nm, 'row_change', threshold_nm)
                # Get summaries for numerical columns
                num_cols = df.select_dtypes(include=np.number).columns
                num_cols = [x for x in num_cols if x not in ['time', 'row_change']]
                for col in num_cols:
                    # Get stats and save them to dictionary
                    stats = tier_nm_sample[col].describe().to_dict()
                    summaries_nm[col] = stats

                # Dictionary to df row
                summary_row_nm = dict_to_df(summaries_nm)
                summary_row_nm = summary_row_nm.loc[:, ~summary_row_nm.columns.str.contains('count|%', regex=True)]

                # Add metainfo
                summary_row_nm['trialid'] = trialid
                summary_row_nm['eventid'] = trialid + '_nonmov_' + str(counter)
                summary_row_nm['anno_value'] = 'nomovement'

                # Add row to the main df
                dataset_features = pd.concat([dataset_features, summary_row_nm])
                counter += 1

        counter = 1

        ###################### Process border windows

        border_rows = []

        # Identify the rows where the tierofinterest changes
        change_points = df[df['row_change'].diff().abs() > 0].index

        for idx in change_points:
            # Get the window before the change
            before_start = max(0, idx - 25)  # Ensure no negative index
            before_end = idx  # Up to the change point
            before_window = df.iloc[before_start:before_end]
            # Get the annotation value
            anno_value = df.loc[idx, tierofinterest]

            # Get the window after the change
            after_start = idx
            after_end = min(len(df), idx + 25)  # Ensure no index exceeds the DataFrame length
            after_window = df.iloc[after_start:after_end]

            # Process the 'before' window in the same way as the regular chunks above
            if not before_window.empty:
                num_cols = df.select_dtypes(include=np.number).columns
                num_cols = [x for x in num_cols if x not in ['time', 'row_change']]
                summaries_before = {col: before_window[col].describe().to_dict() for col in num_cols}
                summary_row_before = dict_to_df(summaries_before)
                summary_row_before = summary_row_before.loc[:, ~summary_row_before.columns.str.contains('count|%', regex=True)]
                summary_row_before['trialid'] = trialid
                if anno_value == 'movement':
                    summary_row_before['eventid'] = f"{trialid}_border_mov_{counter}"
                else:
                    summary_row_before['eventid'] = f"{trialid}_border_nonmov_{counter}"
                summary_row_before['anno_value'] = anno_value
                dataset_features = pd.concat([dataset_features, summary_row_before])
                counter += 1

            # Process the 'after' window in the same way as the regular chunks above
            if not after_window.empty:
                summaries_after = {col: after_window[col].describe().to_dict() for col in num_cols}
                summary_row_after = dict_to_df(summaries_after)
                summary_row_after = summary_row_after.loc[:, ~summary_row_after.columns.str.contains('count|%', regex=True)]
                summary_row_after['trialid'] = trialid
                if anno_value == 'movement':
                    summary_row_after['eventid'] = f"{trialid}_border_mov_{counter}"
                else:
                    summary_row_after['eventid'] = f"{trialid}_border_nonmov_{counter}"
                summary_row_after['anno_value'] = anno_value
                dataset_features = pd.concat([dataset_features, summary_row_after])
                counter += 1

    # Drop all columns with NaN values
    dataset_features = dataset_features.dropna(axis=1)
    # Save it
    filename = '\\dataset_' + tierofinterest + '_features.csv'
    dataset_features.to_csv(datasetfolder + filename, index=False)
        
        

This is what the dataset looks like:

COPc_mean COPc_std COPc_min COPc_max pelvis_tilt_moment_mean pelvis_tilt_moment_std pelvis_tilt_moment_min pelvis_tilt_moment_max pelvis_list_moment_mean pelvis_list_moment_std ... HeadRankleDistance_y_std HeadRankleDistance_y_min HeadRankleDistance_y_max HeadRankleDistance_z_mean HeadRankleDistance_z_std HeadRankleDistance_z_min HeadRankleDistance_z_max trialid eventid anno_value
0 0.001029 0.000353 0.000497 0.001539 11.916939 0.523783 10.259013 12.440273 40.593621 1.214134 ... 0.388264 -47.812929 -46.498456 142.714523 0.264402 142.303580 143.190783 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_1 movement
1 0.002310 0.000357 0.001384 0.002592 -5.251434 6.670124 -14.455521 6.433171 -43.388846 11.359658 ... 0.951654 -46.226361 -43.045054 147.654005 0.819087 146.352625 149.044777 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_2 movement
2 0.001229 0.000654 0.000325 0.002082 4.686266 1.693475 0.823981 6.247819 24.678865 12.110897 ... 0.668766 -50.287726 -48.094340 139.471133 0.828517 138.035172 140.793319 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_3 movement
3 0.010437 0.003827 0.005062 0.016617 9.534314 4.984207 1.780355 16.748078 -22.117817 12.681626 ... 0.335743 -48.799024 -47.725756 143.580503 0.088882 143.457698 143.710256 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_4 movement
4 0.001004 0.000326 0.000497 0.001522 11.714350 0.759333 9.564795 12.252050 40.096545 1.999604 ... 0.406749 -47.938470 -46.575235 142.787307 0.272442 142.360696 143.261214 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_5 movement
5 0.001461 0.000432 0.000692 0.002762 13.462955 1.314506 12.131364 15.747833 48.942522 10.038141 ... 0.266764 -46.886244 -46.003796 142.217696 0.202835 141.921413 142.578043 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_6 movement
6 0.005301 0.000282 0.004481 0.005716 -9.424995 13.097252 -24.188497 14.602104 -123.839005 12.716710 ... 0.598752 -26.614199 -24.623861 159.472153 0.330003 158.899835 160.012364 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_7 movement
7 0.004990 0.000920 0.003090 0.007364 12.518248 2.241052 9.648455 15.750968 87.260236 6.093794 ... 0.132448 -45.982839 -45.587361 141.964505 0.093210 141.876556 142.181769 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_8 movement
8 0.003358 0.000680 0.002573 0.004292 14.285585 8.257143 -3.787133 23.009216 4.935150 15.441791 ... 0.914203 -31.859550 -28.740297 156.858607 0.546987 155.927757 157.763944 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_9 movement
9 0.000474 0.000222 0.000257 0.001104 7.607259 1.802191 3.210280 9.359289 -80.437708 8.600917 ... 0.621470 -31.547830 -29.416171 158.248435 0.507886 157.346089 159.038608 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_mov_10 movement
10 0.000327 0.000172 0.000081 0.000551 1.142706 2.774018 -2.489563 6.568694 -59.022974 8.534245 ... 0.808444 -22.051561 -20.066378 159.953388 0.388046 159.529543 160.445080 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_1 nomovement
11 0.000859 0.000627 0.000175 0.001681 1.956649 2.745580 -1.319160 6.667728 -60.887893 1.757583 ... 0.224639 -21.057542 -20.421462 160.343627 0.493233 159.797253 160.919489 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_2 nomovement
12 0.000508 0.000227 0.000243 0.001041 1.658757 0.735249 0.017040 3.188685 -67.195356 6.031778 ... 2.212455 -25.605054 -20.390822 160.404137 0.263193 159.981512 160.727456 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_3 nomovement
13 0.000446 0.000291 0.000050 0.001093 2.139294 1.726763 -0.846748 6.088638 -62.201977 2.714513 ... 0.336568 -21.325884 -20.390822 160.209853 0.453761 159.732042 160.736602 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_4 nomovement
14 0.000276 0.000111 0.000125 0.000471 1.090214 1.638015 -1.945115 3.100118 -63.428529 5.286197 ... 1.425956 -24.065452 -20.488453 160.027971 0.174932 159.755498 160.246507 merged_anno_0_1_44_p0 merged_anno_0_1_44_p0_nonmov_5 nomovement

15 rows × 1263 columns

Preparing timeseries for classifying

Now we also prepare the data for the classifier. We will use the same features as in the previous step, but now we summarize the whole timeseries from beginning to end. Each time, we take a 100 ms chunk (50 rows) and slide it forward in steps of 12 rows (~25 ms), so that the chunks overlap and we can later assess more accurately when exactly the movement onset/offset occurs.
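
As a quick sanity check on the sliding windows, this small sketch prints the row ranges of the first few chunks for a 50-row window sliding by 12 rows; the millisecond values assume the ~2 ms-per-row sampling inferred earlier and match the start/end times in the example output below:

summary_interval, slide_step = 50, 12
for i, start in enumerate(range(0, 60, slide_step), start=1):
    end = start + summary_interval - 1
    print(f"chunk {i}: rows {start}-{end}  (~{start * 2}-{end * 2} ms)")
# chunk 1: rows 0-49 (~0-98 ms), chunk 2: rows 12-61 (~24-122 ms), ...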

Custom functions
# Function to summarize every 50 rows with overlapping intervals, sliding by 12 rows
def summarize_consecutive_rows(df, trialid, num_cols, summary_interval=50, slide_step=12):
    summary_df = pd.DataFrame()
    counter = 1

    for start_idx in range(0, len(df), slide_step):
        # Select a slice of 50 rows (or fewer for the last chunk)
        selected = df.iloc[start_idx:start_idx + summary_interval]
        
        # Stop if there are no more rows to process
        if selected.empty:
            break
            
        summary_stats = {}

        # Calculate statistics for each numerical column
        for col in num_cols:
            stats = selected[col].describe().to_dict()
            summary_stats[col] = stats

        # Convert to DataFrame row format
        summary_row = dict_to_df(summary_stats)

        # Add start and end time for the chunk
        summary_row['start_time'] = selected['time'].iloc[0]
        summary_row['end_time'] = selected['time'].iloc[-1]

        # Add chunk number
        summary_row['eventid'] = f"{trialid}_chunk_{counter}"

        # Get rid of all columns that contain 'count' or '%' in the name
        summary_row = summary_row.loc[:, ~summary_row.columns.str.contains('count|%', regex=True)]

        # Append to the main DataFrame
        summary_df = pd.concat([summary_df, summary_row], ignore_index=True)

        counter += 1

        # If the selected already contains time of the last row, finish
        if selected['time'].iloc[-1] == df['time'].iloc[-1]:
            return summary_df

    return summary_df
# DataFrame to store the chunk summaries for the current trial
summary_df = pd.DataFrame()

for file in samplingfiles:

    # TrialID
    trialid = file.split('\\')[-1].split('.')[0]

    df = pd.read_csv(file)

    # If the df doesn't have columns arms, upper_body, lower_body, head_mov, skip it
    if 'arms' not in df.columns or 'upper_body' not in df.columns or 'lower_body' not in df.columns or 'head_mov' not in df.columns:
        print('skipping ' + trialid)
        continue
    else:
        print('working on ' + trialid)

    # Define numerical columns (excluding 'time' and 'change' if present)
    num_cols = [col for col in df.select_dtypes(include=np.number).columns if col != 'change' and col != 'time']

    # Summarize data in intervals of 50 rows, sliding by 12 rows
    summary_df = summarize_consecutive_rows(df, trialid, num_cols, summary_interval=50, slide_step=12)

    # Add trial ID 
    summary_df['trialid'] = trialid

    # Save it
    summary_df.to_csv(chunked_folder + trialid + '_chunked.csv', index=False)

print('All done, now we can proceed with annotation with our classifier')

This is an example of a file containing the timeseries processed into chunks:

COPc_mean COPc_std COPc_min COPc_max pelvis_tilt_moment_mean pelvis_tilt_moment_std pelvis_tilt_moment_min pelvis_tilt_moment_max pelvis_list_moment_mean pelvis_list_moment_std ... HeadRankleDistance_y_min HeadRankleDistance_y_max HeadRankleDistance_z_mean HeadRankleDistance_z_std HeadRankleDistance_z_min HeadRankleDistance_z_max start_time end_time eventid trialid
0 0.000209 0.000104 0.000023 0.000360 -4.822713 0.329021 -5.373228 -4.274271 24.058442 0.542426 ... 157.008942 157.158540 7.932183 0.016104 7.913146 7.966076 0.0 98.0 merged_anno_0_1_4_p0_chunk_1 merged_anno_0_1_4_p0
1 0.000264 0.000098 0.000118 0.000380 -4.552120 0.326263 -5.103672 -4.004715 23.612342 0.537878 ... 157.050898 157.187292 7.921748 0.009694 7.912271 7.944289 24.0 122.0 merged_anno_0_1_4_p0_chunk_2 merged_anno_0_1_4_p0
2 0.000316 0.000073 0.000136 0.000390 -4.283394 0.326321 -4.834117 -3.735160 23.169319 0.537974 ... 157.089436 157.213695 7.916291 0.004669 7.912271 7.928391 48.0 146.0 merged_anno_0_1_4_p0_chunk_3 merged_anno_0_1_4_p0
3 0.000345 0.000029 0.000288 0.000403 -4.014668 0.326350 -4.564561 -3.465605 22.726297 0.538022 ... 157.124298 157.237953 7.914763 0.002189 7.912271 7.919865 72.0 170.0 merged_anno_0_1_4_p0_chunk_4 merged_anno_0_1_4_p0
4 0.000396 0.000097 0.000288 0.000649 -3.745942 0.326350 -4.295006 -3.196049 22.283274 0.538022 ... 157.156162 157.260390 7.916547 0.004392 7.912271 7.926353 96.0 194.0 merged_anno_0_1_4_p0_chunk_5 merged_anno_0_1_4_p0
5 0.000463 0.000139 0.000288 0.000669 -3.477216 0.326321 -4.025450 -2.926494 21.840251 0.537974 ... 157.185195 157.281624 7.920605 0.006376 7.912421 7.931427 120.0 218.0 merged_anno_0_1_4_p0_chunk_6 merged_anno_0_1_4_p0
6 0.000517 0.000134 0.000288 0.000669 -3.208490 0.326263 -3.755895 -2.656938 21.397228 0.537878 ... 157.211721 157.303893 7.923801 0.005444 7.914729 7.931427 144.0 242.0 merged_anno_0_1_4_p0_chunk_7 merged_anno_0_1_4_p0
7 0.000596 0.000070 0.000380 0.000669 -2.939764 0.326176 -3.486340 -2.406044 20.954205 0.537735 ... 157.236187 157.327390 7.920155 0.012288 7.885567 7.931427 168.0 266.0 merged_anno_0_1_4_p0_chunk_8 merged_anno_0_1_4_p0
8 0.000629 0.000029 0.000576 0.000669 -2.672253 0.324061 -3.216784 -2.142126 20.510529 0.538635 ... 157.258711 157.353423 7.906374 0.026949 7.853243 7.931427 192.0 290.0 merged_anno_0_1_4_p0_chunk_9 merged_anno_0_1_4_p0
9 0.000612 0.000038 0.000488 0.000661 -2.419432 0.304439 -2.965890 -1.963964 20.058402 0.553000 ... 157.278528 157.370746 7.888315 0.031005 7.848620 7.931427 216.0 314.0 merged_anno_0_1_4_p0_chunk_10 merged_anno_0_1_4_p0
10 0.000570 0.000096 0.000373 0.000661 -2.185570 0.269905 -2.677673 -1.763470 19.591719 0.575306 ... 157.302064 157.383235 7.878944 0.023500 7.848620 7.921558 240.0 338.0 merged_anno_0_1_4_p0_chunk_11 merged_anno_0_1_4_p0
11 0.000520 0.000107 0.000373 0.000661 -1.971889 0.240694 -2.424706 -1.589662 19.105494 0.609702 ... 157.325592 157.388509 7.893162 0.046331 7.848620 8.008189 264.0 362.0 merged_anno_0_1_4_p0_chunk_12 merged_anno_0_1_4_p0
12 0.000524 0.000115 0.000373 0.000714 -1.779204 0.223516 -2.156786 -1.407064 18.598208 0.628551 ... 157.351755 157.390481 7.941962 0.088600 7.848620 8.132930 288.0 386.0 merged_anno_0_1_4_p0_chunk_13 merged_anno_0_1_4_p0
13 0.000518 0.000119 0.000373 0.000714 -1.599161 0.217004 -1.977071 -1.252625 18.074484 0.648964 ... 157.369463 157.391156 8.024363 0.122492 7.858972 8.245924 312.0 410.0 merged_anno_0_1_4_p0_chunk_14 merged_anno_0_1_4_p0
14 0.000479 0.000162 0.000197 0.000714 -1.426408 0.203184 -1.777833 -1.089737 17.541551 0.649080 ... 157.382657 157.408323 8.126996 0.137761 7.915174 8.367000 336.0 434.0 merged_anno_0_1_4_p0_chunk_15 merged_anno_0_1_4_p0

15 rows × 2080 columns


Now we are ready to train the logistic regression classifier.
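
To preview where this leads, here is a minimal sketch (assuming scikit-learn is available) of how one of the training sets created above could feed the classifier. The split, scaling and hyperparameters are placeholders; the actual training and evaluation are covered in the next part.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load one of the per-tier training sets created above
train = pd.read_csv(datasetfolder + 'dataset_arms_features.csv')
X = train.drop(columns=['trialid', 'eventid', 'anno_value'])
y = (train['anno_value'] == 'movement').astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))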

References

Wittenburg, Peter, Hennie Brugman, Albert Russel, Alex Klassmann, and Han Sloetjes. 2006. “ELAN: A Professional Framework for Multimodality Research.” In Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC 2006).