Source code for snafu.word_properties

import numpy as np
from . import *


[docs]
def wordFrequency(subj, missing=0.5, data=None):
    """
    Compute average word frequency for fluency responses.

    Thin wrapper around :func:`wordStat` that supports both hierarchical and
    non-hierarchical fluency data. The frequency dictionary is read from
    `data` by ``wordStat``.

    Parameters
    ----------
    subj : list
        Fluency data. Either a list of lists of words (non-hierarchical) or a
        list of such lists, one per individual (hierarchical).
    missing : float, optional
        Value substituted for words not found in the frequency dictionary
        (default is 0.5). If None, such words are left out of the mean and
        reported as excluded instead.
    data : str
        Path to the word-frequency CSV file; passed unchanged to ``wordStat``.

    Returns
    -------
    tuple
        For hierarchical data: ``(freqs, excludeds)`` where ``freqs`` holds one
        value per individual (the mean of that individual's per-list means)
        and ``excludeds`` holds, per individual, the excluded words run
        through ``flatten_list``.
        For non-hierarchical data: exactly what ``wordStat`` returns, i.e.
        ``(per-list means, per-list excluded words)``.

    Notes
    -----
    The data shape is detected from the first *non-empty* top-level entry, so
    an empty data set or an empty leading list no longer raises ``IndexError``.
    Data with no non-empty entry at all is treated as non-hierarchical.
    """
    # Probe the first non-empty entry: indexing subj[0][0] directly fails when
    # subj is empty or when the first list/individual has no content.
    first_nonempty = next((entry for entry in subj if len(entry) > 0), None)
    hierarchical = first_nonempty is not None and isinstance(first_nonempty[0], list)

    # if fluency data are hierarchical, report mean per individual
    if hierarchical:
        freqs = []
        excludeds = []
        for subject_lists in subj:
            freq, excluded = wordStat(subject_lists, missing=missing, data=data)
            # np.mean of an empty sequence yields nan (with a RuntimeWarning),
            # which marks individuals that had no scorable list.
            freqs.append(np.mean(freq))
            excludeds.append(flatten_list(excluded))
        return freqs, excludeds

    # if fluency data are non-hierarchical, report mean per list
    freq, excluded = wordStat(subj, missing=missing, data=data)
    return freq, excluded



[docs]
def ageOfAcquisition(subj, missing=None, data=None):
    """
    Compute average age of acquisition (AoA) for fluency responses.

    Thin wrapper around :func:`wordStat` that supports both hierarchical and
    non-hierarchical fluency data. The AoA dictionary is read from `data` by
    ``wordStat``.

    Parameters
    ----------
    subj : list
        Fluency data. Either a list of lists of words (non-hierarchical) or a
        list of such lists, one per individual (hierarchical).
    missing : float, optional
        Value to use for words not found in the AoA dictionary. If None
        (the default), such words are left out of the mean and reported as
        excluded.
    data : str
        Path to the AoA CSV file; passed unchanged to ``wordStat``.

    Returns
    -------
    tuple
        For hierarchical data: ``(aoas, excludeds)`` where ``aoas`` holds one
        value per individual (the mean of that individual's per-list means)
        and ``excludeds`` holds, per individual, the excluded words run
        through ``flatten_list``.
        For non-hierarchical data: exactly what ``wordStat`` returns, i.e.
        ``(per-list means, per-list excluded words)``.

    Notes
    -----
    The data shape is detected from the first *non-empty* top-level entry, so
    an empty data set or an empty leading list no longer raises ``IndexError``.
    Data with no non-empty entry at all is treated as non-hierarchical.
    """
    # Probe the first non-empty entry: indexing subj[0][0] directly fails when
    # subj is empty or when the first list/individual has no content.
    first_nonempty = next((entry for entry in subj if len(entry) > 0), None)
    hierarchical = first_nonempty is not None and isinstance(first_nonempty[0], list)

    # if fluency data are hierarchical, report mean per individual
    if hierarchical:
        aoas = []
        excludeds = []
        for subject_lists in subj:
            aoa, excluded = wordStat(subject_lists, missing=missing, data=data)
            # np.mean of an empty sequence yields nan (with a RuntimeWarning),
            # which marks individuals that had no scorable list.
            aoas.append(np.mean(aoa))
            excludeds.append(flatten_list(excluded))
        return aoas, excludeds

    # if fluency data are non-hierarchical, report mean per list
    aoa, excluded = wordStat(subj, missing=missing, data=data)
    return aoa, excluded



[docs]
def wordStat(subj, missing=None, data=None):
    """
    Compute per-list mean of a word-level statistic (e.g., frequency or AoA).

    Loads a word-to-value dictionary from a CSV file, then averages the values
    of the words in each list. Words absent from the dictionary are either
    replaced by `missing` or left out of the mean.

    Parameters
    ----------
    subj : list of list of str
        Lists of words for which to compute statistics.
    missing : float, optional
        Value to substitute for words not in the dictionary. If None, such
        words are excluded from the computation and reported in the second
        return value.
    data : str
        Path to a CSV file whose rows are ``word,val``. Lines starting with
        ``#`` are skipped. The columns are assigned by position, so every
        remaining row (including a header row, if present) is parsed as data
        and its second field must be convertible to ``float``.

    Returns
    -------
    tuple
        - word_val : list of float
            Mean value of each list that had at least one scorable word.
            Lists with no scorable word contribute no entry, so this list can
            be shorter than `subj`.
        - words_excluded : list of list of str
            One entry per list in `subj`: the words that were not found in
            the dictionary. Only populated when `missing` is None; substituted
            words are not recorded.

    Raises
    ------
    TypeError
        If `data` is None.
    """
    if data is None:
        # open(None) would raise the same exception type with an opaque message
        raise TypeError("wordStat() requires 'data': path to a CSV file of word,val rows")

    # load dictionary
    with open(data, 'rt', encoding='utf-8-sig') as csvfile:
        # allows comments in file thanks to https://stackoverflow.com/a/14158869/353278
        uncommented = (line for line in csvfile if line[0] != '#')
        reader = csv.DictReader(uncommented, fieldnames=['word', 'val'])
        d_val = {row['word']: float(row['val']) for row in reader}

    word_val = []
    words_excluded = []
    for word_list in subj:
        values = []
        excluded = []
        for word in word_list:
            if word in d_val:
                values.append(d_val[word])
            elif missing is not None:
                # unknown word, substituted by the caller-supplied value
                values.append(missing)
            else:
                # unknown word and no substitute: keep it out of the mean
                excluded.append(word)
        # NOTE(review): a list with nothing to average adds no entry to
        # word_val, so word_val and words_excluded can differ in length.
        # Kept as-is because callers average word_val directly.
        if values:
            word_val.append(np.mean(values))
        words_excluded.append(excluded)

    return word_val, words_excluded