Source code for snafu.clustering

from . import *

def clusterSize(fluency_lists, scheme, clustertype='fluid'):
    """
    Compute the average cluster size of each fluency list.

    This function expects a list of lists. To get the average cluster size
    of a single fluency list, wrap it in another list, e.g., [fluency_list].

    Parameters
    ----------
    fluency_lists : list
        A list of fluency lists, e.g., fluencydata.labeledlists
    scheme : str or int
        For semantic fluency data, a path to the clustering scheme (.csv)
        to use. For letter fluency data, an integer giving the number of
        initial letters that define a cluster (e.g., 2).
    clustertype : str, optional
        Type of clustering to apply: 'fluid' (default) or 'static'.

    Returns
    -------
    list of float
        A list containing the average cluster size in each fluency list.
    """
    sizes_per_list = findClusters(fluency_lists, scheme, clustertype)
    # The inner np.mean collapses an entry that is itself a list of sizes
    # (a plain number is its own mean); the outer np.mean then averages
    # those values for the fluency list as a whole.
    return [
        np.mean([np.mean(entry) for entry in sizes])
        for sizes in sizes_per_list
    ]
def clusterSwitch(fluency_lists, scheme, clustertype='fluid', switchrate=False):
    """
    Calculate the number of cluster switches in each fluency list.

    Alternatively, calculate the switch rate (number of switches divided by
    list length). This function expects a list of lists. To calculate the
    number of cluster switches in a single list, wrap it in another list,
    e.g., [fluency_list].

    Parameters
    ----------
    fluency_lists : list
        A list of fluency lists, e.g., fluencydata.labeledlists
    scheme : str or int
        For semantic fluency data, a path to the clustering scheme (.csv)
        to use. For letter fluency data, an integer giving the number of
        initial letters that define a cluster (e.g., 2).
    clustertype : str, optional
        Type of clustering to apply: 'fluid' (default) or 'static'.
    switchrate : bool, optional
        If True, return the switch rate instead of the switch count.
        Default is False.

    Returns
    -------
    list
        One value per entry of `fluency_lists`: the number of switches (or
        the switch rate). When an entry is itself a list of fluency lists,
        the value is the mean over those lists. An entry with no clusters
        contributes 0.
    """
    cluster_sizes = findClusters(fluency_lists, scheme, clustertype)
    results = []
    for list_idx, sizes in enumerate(cluster_sizes):
        if len(sizes) == 0:
            # No clusters at all means no switches.
            results.append(0)
        elif isinstance(sizes[0], list):
            # Nested input: score every sub-list, then average the scores.
            per_list = []
            for sub_idx, sub_sizes in enumerate(sizes):
                if len(sub_sizes) == 0:
                    # Same convention as the top-level case above. Without
                    # this guard an empty sub-list would count as -1
                    # switches and could divide by a zero list length.
                    per_list.append(0)
                    continue
                switches = len(sub_sizes) - 1
                if switchrate:
                    switches = switches / len(fluency_lists[list_idx][sub_idx])
                per_list.append(switches)
            results.append(np.mean(per_list))
        else:
            # n clusters are separated by n - 1 switches.
            switches = len(sizes) - 1
            if switchrate:
                switches = switches / len(fluency_lists[list_idx])
            results.append(switches)
    return results
def findClusters(fluency_lists, scheme, clustertype='fluid'):
    """
    Return the size of every cluster in a fluency list (or nested lists).

    For example, ['dog', 'cat', 'whale', 'shark'] might return [2, 2], as
    there are two clusters of size 2. This function is used internally by
    snafu.clusterSize and snafu.clusterSwitch.

    Parameters
    ----------
    fluency_lists : list
        A list of fluency lists, e.g., fluencydata.labeledlists
    scheme : str or int
        For semantic fluency data, a path to the clustering scheme (.csv)
        to use. For letter fluency data, an integer giving the number of
        initial letters that define a cluster (e.g., 2).
    clustertype : str, optional
        Type of clustering to apply. With 'fluid' (default) each item is
        compared against the categories of the item before it. With
        'static' the categories are narrowed to those shared by the whole
        cluster so far, and reset whenever nothing is shared.

    Returns
    -------
    list
        A list of cluster sizes (or nested list of cluster sizes).

    Raises
    ------
    ValueError
        If `clustertype` is neither 'fluid' nor 'static' (checked when an
        item is processed).
    """
    # A list of lists is walked recursively; a flat list of items is first
    # converted to ';'-joined category labels.
    if len(fluency_lists) == 0:
        labeled = []
    elif isinstance(fluency_lists[0], list):
        labeled = fluency_lists
    else:
        labeled = labelClusters(fluency_lists, scheme)

    nested_sizes = []
    run_sizes = []
    active_cats = set()
    current_run = 0
    at_start = True

    for entry in labeled:
        if isinstance(entry, list):
            nested_sizes.append(
                findClusters(entry, scheme, clustertype=clustertype)
            )
        else:
            entry_cats = set(entry.split(';'))
            if at_start or not entry_cats.isdisjoint(active_cats):
                # start of list, or the item shares a category: same cluster
                current_run += 1
            else:
                # no shared category: close the cluster and open a new one
                run_sizes.append(current_run)
                current_run = 1

            if clustertype == "fluid":
                active_cats = entry_cats
            elif clustertype == "static":
                # fall back to the item's own categories when none are shared
                active_cats = (active_cats & entry_cats) or entry_cats
            else:
                raise ValueError('Invalid cluster type')
        at_start = False

    run_sizes.append(current_run)
    if sum(run_sizes) > 0:
        nested_sizes += run_sizes
    return nested_sizes
def _loadScheme(path):
    """Parse a 'category,item' scheme file into {item: 'cat1;cat2;...'}."""
    def _clean(text):
        # basic clean-up so that spelling variants map to the same key
        return text.lower().replace(' ', '').replace("'", "").replace("-", "")

    cats = {}
    # utf-8-sig transparently drops a byte-order mark if one is present
    with open(path, 'rt', encoding='utf-8-sig') as cf:
        for line in cf:
            line = line.rstrip()
            if not line or line[0] == "#":
                continue  # skip blank and commented lines
            # every remaining line must hold exactly two comma-separated fields
            cat, item = line.split(',')
            cat = _clean(cat)
            item = _clean(item)
            if item not in cats:
                cats[item] = cat
            elif cat not in cats[item].split(';'):
                # compare whole category names, not substrings, so that
                # e.g. 'cat' is still added when 'cattle' is already there
                cats[item] = cats[item] + ';' + cat
    return cats


def _labelItems(items, cats, maxletters, labelIntrusions, targetLetter):
    """Label items recursively; `cats` is None for letter fluency data."""
    labels = []
    for item in items:
        if isinstance(item, list):
            labels.append(
                _labelItems(item, cats, maxletters, labelIntrusions, targetLetter)
            )
            continue
        item = item.lower().replace(' ', '')
        if cats is not None:
            # semantic data: look the item up in the scheme
            if item in cats:
                labels.append(cats[item])
            elif labelIntrusions:
                # item not in scheme: label it rather than dropping it
                labels.append("intrusion")
        elif (item[0] == targetLetter) or (targetLetter is None and not labelIntrusions):
            # letter data: the label is the item's first N letters
            labels.append(item[:maxletters])
        elif labelIntrusions:
            if targetLetter is None:
                raise Exception('Cant label intrusions without a target letter [labelClusters]')
            labels.append("intrusion")
    return labels


def labelClusters(fluency_lists, scheme, labelIntrusions=False, targetLetter=None):
    """
    Replace each item in a fluency list (or list of fluency lists) with its
    category or categories.

    For example, ['dog', 'cat', 'whale', 'shark'] might return
    ['canine;pets', 'pets', 'fish;water', 'fish;water']. This function is
    used internally by snafu.findClusters.

    Parameters
    ----------
    fluency_lists : list
        A list of fluency lists, e.g., fluencydata.labeledlists
    scheme : str or int
        For semantic fluency data, a path to the clustering scheme (.csv)
        to use. For letter fluency data, an integer giving the number of
        initial letters that define a cluster (e.g., 2).
    labelIntrusions : bool, optional
        When False, intrusions are silently omitted (as if they do not
        exist). When True, intrusions are replaced with the pseudo-category
        label 'intrusion'. Default is False.
    targetLetter : str, optional
        For letter fluency data, identifies the target letter. This is
        necessary only to identify intrusions (when labelIntrusions is set
        to True), otherwise it has no effect. Default is None.

    Returns
    -------
    list
        A list (or nested list) of categories corresponding to each item.

    Raises
    ------
    Exception
        If `scheme` is neither a str nor an int, or if intrusions are to be
        labeled in letter fluency data without a `targetLetter`.
    """
    if isinstance(scheme, str):
        # semantic clustering: read the scheme file once, up front, instead
        # of once per nested sub-list
        cats = _loadScheme(scheme)
        maxletters = None
    elif isinstance(scheme, int):
        # letter clustering: use the first N letters as the cluster label
        cats = None
        maxletters = scheme
        if targetLetter:
            targetLetter = targetLetter.lower()
    else:
        raise Exception('Unknown clustering type in labelClusters()')

    return _labelItems(fluency_lists, cats, maxletters, labelIntrusions, targetLetter)