#Preview

# -*- coding: utf-8 -*-
"""
Created on Thu Apr  4 

@author: gregoire

Python 3.6.2
pandas 0.20.3
numpy 1.13.1

set the release folder as the cwd before running

prints a csv table summarizing the pvds plates, inkj plates, inkj samples
in an HTEJCAP database release using the 2 summary tables
plate.csv and composition.csv
"""

import pandas as pd
import numpy as np
import os

#Both summary tables live in the 'plate' subfolder of the release folder,
#which must be the cwd. The path components are passed separately to
#os.path.join so the correct separator is used on every OS (a hard-coded
#backslash only resolves on Windows).
p=os.path.join(os.getcwd(),'plate','plate.csv')

#df has a row for each plate, including pvds and inkj-based synthesis
df=pd.read_csv(p)


p2=os.path.join(os.getcwd(),'plate','composition.csv')

#df2 has a row for each inkj (plate_id,sample_no), including reference samples
df2=pd.read_csv(p2)

#df2 holds the ink loading of every inkj sample; after accounting for the
#cation concentration in each ink this gives the atomic fraction of each
#cation, stored in the columns whose name ends with '.PM.AtFrac'.
#Collect those column names in sorted order.
inkj_atfrac_keys=sorted(k for k in df2.keys() if k.endswith('.PM.AtFrac'))

#carr is n smps x n els (float32) where inkj_atfrac_keys are the els;
#missing fractions (NaN) are treated as zero loading
carr=np.array([df2[k] for k in inkj_atfrac_keys],dtype=np.float32).T
nan_mask=np.isnan(carr)
carr[nan_mask]=0.

#The element label of a column is the text before the first '.' of its name.
inkj_elements=[k.partition('.')[0] for k in inkj_atfrac_keys]

#df contains the composition system of each plate, but individual inkj samples
#have their own composition systems, e.g. a subsystem of the plate's comp.
#space. Here the comp. system of each sample is saved as a '-'-joined string of
#the elements with fraction > 0, in the sorted column order (presumably the
#same format as df['composition_system'] -- confirm against plate.csv).
inkj_composition_system=np.array(\
    ['-'.join(el for el,frac in zip(inkj_elements,row) if frac>0.) \
     for row in carr])



#classif_d maps a classifier key to an array/Series with one entry per row of
#df (i.e. per plate) that flags whether the plate belongs to that class.
#Three families of classifiers are built: print type, anion class and number
#of cations. They are multiplied together later to build the summary table.
classif_d={}

#print type: boolean flag per plate
print_class_keys=['pvds','inkj']
classif_d.update((pt,df['print_type']==pt) for pt in print_class_keys)

#6 anion classes, O,S,N,OS,ON,other (other is typically intended to be metals)
#assumes the contains_* columns are 0/1 (or boolean) flags so that products
#act as logical AND and (1.-flag) acts as logical NOT
has_o=df['contains_O']
has_s=df['contains_S']
has_n=df['contains_N']
no_o=1.-has_o
no_s=1.-has_s
no_n=1.-has_n
classif_d['OS']=has_o*has_s*no_n
classif_d['ON']=has_o*has_n*no_s
classif_d['O']=has_o*no_s*no_n
classif_d['S']=has_s*no_o*no_n
classif_d['N']=has_n*no_o*no_s
anion_class_keys=['O','S','N','OS','ON']
#other means not any of the above 5 classes
named_anion_flags=np.array([classif_d[k] for k in anion_class_keys])
classif_d['other']=(1.-named_anion_flags).prod(axis=0)
anion_class_keys.append('other')

#number of anions per plate = sum over classes of (flag * anions in class);
#subtracting it from the element count gives the number of cations
anions_per_class=[1,1,1,2,2,0]
weighted_flags=np.array([classif_d[k]*n_an for k,n_an in \
                         zip(anion_class_keys,anions_per_class)])
classif_d['num_anions']=weighted_flags.sum(axis=0,dtype='int32')
classif_d['num_cations']=df['num_elements']-classif_d['num_anions']

#cations classified by the number: one class per value in n_cation_vals plus
#a final '>N' class for everything above the largest value
n_cation_vals=range(1,5)
cation_class_keys=['%d' %n_cat for n_cat in n_cation_vals]
for key,n_cat in zip(cation_class_keys,n_cation_vals):
    classif_d[key]=classif_d['num_cations']==n_cat
largest_n_cat=n_cation_vals[-1]
overflow_key='>%d' %largest_n_cat
classif_d[overflow_key]=classif_d['num_cations']>largest_n_cat
cation_class_keys.append(overflow_key)


#csvvals collects the rows (lists of strings) of the output csv.
#One subtable is made per print type (pvds, inkj): rows are anion classes,
#columns are the number-of-cations classes, values are plate counts, and the
#last column is the number of plate-level composition systems.
csvvals=[]
for pk in print_class_keys:
    #table header lines
    csvvals.append(['','num. %s libraries' %pk]+['']*5)
    csvvals.append(['num. cations in plate:']+cation_class_keys+\
                   ['num. comp. sys.'])

    for ak in anion_class_keys:
        plate_counts=[]
        n_comp_sys=0
        for ck in cation_class_keys:
            #the product of the 3 classifier arrays is stored under the tuple
            #key and is nonzero only for plates matching all 3 classifiers
            combo=(pk,ak,ck)
            stacked=np.array([classif_d[pk],classif_d[ak],classif_d[ck]])
            classif_d[combo]=stacked.prod(axis=0,dtype='int32')
            plate_counts.append('%d' %classif_d[combo].sum())
            #composition systems are strings with elements ordered, so the
            #number of unique strings among the matching plates is the number
            #of unique plate-level composition systems
            matching=classif_d[combo]>0
            n_comp_sys+=len(set(df['composition_system'][matching]))
        csvvals.append([ak]+plate_counts+['%d' %n_comp_sys])

#the last of the 3 subtables is similar to above but at the sample level
#the pvds libraries are continuous libraries so every sample contains the
#full set of plate elements, but inkj samples can be subsystems of the plate
#elements, so only inkj samples are tabulated here.
#pk is set explicitly so this section does not depend on whatever value an
#earlier loop happened to leave in the variable.
pk='inkj'
csvvals+=[['','num. %s samples' %pk,'','','','','']]
csvvals+=[['num. cations in sample:']+cation_class_keys+\
                              ['num. unique element combinations']]
#threshold for the final '>N' column, taken from the class definition itself
max_n_cations=n_cation_vals[-1]
for ak in anion_class_keys:
    #get the plate_ids from the plate table that are inkj & in the anion class
    in_print_and_anion_class=np.array([classif_d[k] for k in (pk,ak)])\
                                     .prod(axis=0,dtype='bool')
    pids=df['plate_id'][in_print_and_anion_class]
    #use plate ids to get the samples from the inkj table and create a flat
    #list of the composition system strings
    compsys_samples=[s for pid in pids \
                     for s in inkj_composition_system[df2['plate_id']==pid]]
    #s.count('-')+1 is a shortcut for number of cations
    #(inkj table doesn't include anions added via annealing).
    #A sample with no positive cation fraction has an empty string and must be
    #counted as 0 cations, not 1, so it falls in none of the columns.
    n_cations_inkj_samples=np.array(\
                    [s.count('-')+1 if s else 0 for s in compsys_samples])

    #now count the inkj samples in each number-of-cations class
    sample_counts_by_ck=[(n_cations_inkj_samples==nc).sum() \
                         for nc in n_cation_vals]
    sample_counts_by_ck+=[(n_cations_inkj_samples>max_n_cations).sum()]

    #the empty string is not an element combination, so leave it out of the
    #unique count for consistency with the cation columns
    unique_combinations={s for s in compsys_samples if s}

    csvvals+=[[ak]+\
        ['%d' %v for v in sample_counts_by_ck]+\
        ['%d' %len(unique_combinations)]]

#emit the 3 subtables as comma-separated lines
print('\n'.join(','.join(row) for row in csvvals))