Source code for snafu.word_properties

import numpy as np
from . import *

def wordFrequency(subj, missing=0.5, data=None):
    """
    Compute average word frequency for fluency responses.

    Delegates the dictionary lookup to ``wordStat`` and supports both
    hierarchical and non-hierarchical fluency data.

    Parameters
    ----------
    subj : list
        Fluency data. Either a list of lists of words (non-hierarchical) or
        a list of such lists, one per individual (hierarchical). The shape is
        decided from the first non-empty top-level entry: if its first
        element is a ``list`` the data are treated as hierarchical.
    missing : float, optional
        Value substituted for words not found in the frequency dictionary
        (default is 0.5). If None, such words are excluded instead.
    data : str
        Path to the CSV file of word frequencies (word, value per row).
        It is passed straight to ``wordStat``, which opens it.

    Returns
    -------
    tuple
        Always a 2-tuple ``(values, excluded)``.

        - Hierarchical data: ``values`` holds, per individual, the mean of
          that individual's per-list means; ``excluded`` holds, per
          individual, the result of ``flatten_list`` applied to the
          per-list excluded words.
        - Non-hierarchical data: exactly what ``wordStat`` returns
          (per-list means, per-list excluded words).
    """
    # Decide on the data shape from the first non-empty top-level entry, so
    # that an empty `subj` or an empty leading entry no longer raises
    # IndexError (the previous check was `subj[0][0]`).
    first_nonempty = next((entry for entry in subj if len(entry) > 0), None)
    hierarchical = (first_nonempty is not None
                    and isinstance(first_nonempty[0], list))

    # hierarchical data: report one mean per individual
    if hierarchical:
        freqs = []
        excludeds = []
        for subject_lists in subj:
            freq, excluded = wordStat(subject_lists, missing=missing, data=data)
            # np.mean of an empty sequence is nan (with a RuntimeWarning);
            # that happens when wordStat produced no value for any list.
            freqs.append(np.mean(freq))
            excludeds.append(flatten_list(excluded))
        return freqs, excludeds

    # non-hierarchical data: report one mean per list
    freq, excluded = wordStat(subj, missing=missing, data=data)
    return freq, excluded
def ageOfAcquisition(subj, missing=None, data=None):
    """
    Compute average age of acquisition (AoA) for fluency responses.

    Delegates the dictionary lookup to ``wordStat`` and supports both
    hierarchical and non-hierarchical fluency data.

    Parameters
    ----------
    subj : list
        Fluency data. Either a list of lists of words (non-hierarchical) or
        a list of such lists, one per individual (hierarchical). The shape is
        decided from the first non-empty top-level entry: if its first
        element is a ``list`` the data are treated as hierarchical.
    missing : float, optional
        Value to use for words not found in the AoA dictionary. If None
        (the default), such words are excluded and reported instead.
    data : str
        Path to the CSV file of AoA scores (word, value per row).
        It is passed straight to ``wordStat``, which opens it.

    Returns
    -------
    tuple
        Always a 2-tuple ``(values, excluded)``.

        - Hierarchical data: ``values`` holds, per individual, the mean of
          that individual's per-list means; ``excluded`` holds, per
          individual, the result of ``flatten_list`` applied to the
          per-list excluded words.
        - Non-hierarchical data: exactly what ``wordStat`` returns
          (per-list means, per-list excluded words).
    """
    # Decide on the data shape from the first non-empty top-level entry, so
    # that an empty `subj` or an empty leading entry no longer raises
    # IndexError (the previous check was `subj[0][0]`).
    first_nonempty = next((entry for entry in subj if len(entry) > 0), None)
    hierarchical = (first_nonempty is not None
                    and isinstance(first_nonempty[0], list))

    # hierarchical data: report one mean per individual
    if hierarchical:
        aoas = []
        excludeds = []
        for subject_lists in subj:
            aoa, excluded = wordStat(subject_lists, missing=missing, data=data)
            # np.mean of an empty sequence is nan (with a RuntimeWarning);
            # that happens when wordStat produced no value for any list,
            # e.g. every word was missing and `missing` is None.
            aoas.append(np.mean(aoa))
            excludeds.append(flatten_list(excluded))
        return aoas, excludeds

    # non-hierarchical data: report one mean per list
    aoa, excluded = wordStat(subj, missing=missing, data=data)
    return aoa, excluded
def _load_word_values(data):
    """Read a two-column CSV (word, value) into a ``{word: float}`` dict."""
    # Imported locally so this block does not rely on `csv` reaching the
    # module namespace through the package-level star import.
    import csv

    word_values = {}
    # newline='' is what the csv module expects for files it parses itself;
    # utf-8-sig transparently drops a leading byte-order mark.
    with open(data, 'rt', encoding='utf-8-sig', newline='') as csvfile:
        # Lines whose first character is '#' are treated as comments, see
        # https://stackoverflow.com/a/14158869/353278
        content_lines = (line for line in csvfile if not line.startswith('#'))
        # Field names are supplied explicitly, so no header row is consumed:
        # every remaining row is parsed as data.
        reader = csv.DictReader(content_lines, fieldnames=['word', 'val'])
        for row in reader:
            word_values[row['word']] = float(row['val'])
    return word_values


def wordStat(subj, missing=None, data=None):
    """
    Compute word-level statistics (e.g., frequency or AoA) from a
    word-to-value dictionary.

    Loads a dictionary mapping words to numeric values from ``data``, then
    computes the mean value of each list in ``subj``. Words missing from the
    dictionary are either substituted by ``missing`` or excluded.

    Parameters
    ----------
    subj : list of list of str
        Lists of words for which to compute statistics.
    missing : float, optional
        Value to substitute for words not in the dictionary. If None
        (the default), those words are left out of the mean and reported
        in ``words_excluded``.
    data : str
        Path to a CSV file whose rows are ``word,value``. Lines starting
        with '#' are skipped; all other rows are parsed as data, and the
        value must be convertible with ``float``.

    Returns
    -------
    tuple
        - word_val : list of float
            Mean value for each list that had at least one usable value.
            Lists without any usable value add no entry, so this list can
            be shorter than ``subj``.
        - words_excluded : list of list of str
            One entry per list in ``subj``: the words that were not found
            in the dictionary and not substituted.
    """
    d_val = _load_word_values(data)

    word_val = []
    words_excluded = []
    for words in subj:  # each list
        scores = []
        excluded = []
        for word in words:  # each word
            if word in d_val:
                scores.append(d_val[word])
            elif missing is not None:
                # unknown word, but a substitute value was supplied
                scores.append(missing)
            else:
                excluded.append(word)
        # Only lists with at least one value get a mean (kept from the
        # original behaviour; avoids np.mean of an empty list).
        if scores:
            word_val.append(np.mean(scores))
        words_excluded.append(excluded)
    return word_val, words_excluded