juergenkemeter
Programmer
Hi!
I have to import all data, stored in an Excel workbook, into a PostgreSQL-database.
Herefore, I export Excel spreadsheets as .csv - files.
My actual Python script imports this csv-file, and generated PostgreSQL files (.psql), which contain the SQL statements for filling in the database with the Excel data.
My actual Problem:
The Excel workbook contains several spreadsheets. These are linked through Hyperlinks, and contain several cell comments.
How can I export these comments and Hyperlinks using Python? My aim is to store these in a corresponding PostgreSQL table.
If it helps, I can send the Excel workbook, and an example .csv-file, via email.
Here the Code of my actual Python script:
cheers,
Juergen
I have to import all data, stored in an Excel workbook, into a PostgreSQL-database.
Herefore, I export Excel spreadsheets as .csv - files.
My actual Python script imports this csv-file, and generated PostgreSQL files (.psql), which contain the SQL statements for filling in the database with the Excel data.
My actual Problem:
The Excel workbook contains several spreadsheets. These are linked through Hyperlinks, and contain several cell comments.
How can I export these comments and Hyperlinks using Python? My aim is to store these in a corresponding PostgreSQL table.
If it helps, I can send the Excel workbook, and an example .csv-file, via email.
Here the Code of my actual Python script:
Code:
#
# This script prepares a number of psql files to bootstrap data into the
# samson database , from the samples spreadsheet
#
#
# Gene Full Name Abbreviations and Alternative Names Order Cellular localisation Functional Classification Magnitude of Change Cycle Stage Follicular Localisation Human Chromosome Weighted Score Count
#
#
# 142 Unknown - 207 Unknown Unknown - - - - 30 2
# 143 Unknown - 207 Unknown Unknown - - - - 30 2
# A2B1 Heterogeneous nuclear ribonucleoproteins A2/B1 hnRNP A2 / hnRNP B1, HNRPA2B1 128 Nuclear Structural - - - 07p15 110 2
# ABC7 ATP-binding cassette, sub-family B (MDR/TAP), member 7 ABC7, ABCB7, ASAT, Atm1p 108 Cytoplasmic Transporter - - - Xq12-q13 125 2
# ACLY ATP citrate lyase ACLY 162 Cytoplasmic Lipid Metabolism - - - 17q12-q21 75 2
#
#
#Gene ---> geneticOb.geneticObName , geneticOb.obkeywords
#Full Name --> geneticOb.geneticObDescription
#Abbreviations.. --> geneticOb.geneticObSymbols, geneticOb.obkeywords
#order ---> geneticFact.AttributeName/Value
#Cellular localisation ----> geneticExpressionFact.cellularLocalisation
#Functional Classification ----> geneticFunctionFact.functionComment
#Magnitude of Change ---> geneticFact.AttributeName/Value
#FollicularLocalisation ----> geneticFact.Attribute/Value
#Chromosome -----> GeneticLocationFact.species ,.chromosome, cytopos
#WeightedScore -----> GeneticFact.attribute/Value
#Count ----> geneticFact.count
import csv
from sets import Set
infilename = "../data/SamsonGenes.csv";
def main():
genGeneBootstrap()
def writeHeader(outfilename):
outfile = open(outfilename,"w")
outfile.write("""
/*
* script generated by gengeneticlocusbootstrap.py
*/
""")
return outfile
def writeFooter(outfile):
outfile.write("""
/*
* **** done ****
*/
""")
outfile.close()
def genGeneBootstrap():
geneSQLDict={}
samsonSQLDict={}
expressionSQLDict={}
functionSQLDict={}
locationSQLDict={}
geneinfoSQLDict={}
reader = csv.reader(open(infilename, "rb"))
cols = ("Gene","Full Name","Abbreviations and Alternative Names",
"Order","Cellular localisation","Functional Classification",
"Magnitude of Change","Cycle Stage",
"Follicular Localisation","Human Chromosome",
"Weighted Score","Count")
coldict={}
ucols = [item.upper() for item in cols]
for row in reader:
#print row
#get column positions if necessary
if len(coldict) == 0 :
ucrow = [item.upper() for item in row]
#print ucrow
try :
for ucitem in ucols :
coldict[ucitem]=ucrow.index(ucitem)
except ValueError:
None
#construct SQL if we have the gene column
#GENE
#FULL NAME
#ABBREVIATIONS
#ORDER
#CELLULAR LOCALISATION
#FUNCTIONAL CLASSIFICATION
#MAGNITUDE OF CHANGE
#FOLLICULARLOCALISATION
#CHROMOSOME
#WEIGHTEDSCORE
#COUNT
if coldict.has_key('GENE'):
if (not geneSQLDict.has_key(row[coldict['GENE']])) and \
row[coldict['GENE']] != 'Gene' and \
len(row[coldict['GENE']].split()) > 0:
# geneticOb insert
print "adding ",row[coldict['GENE']]
geneSQLDict[row[coldict['GENE']]] = "insert into geneticOb(geneticobname,xreflsid,geneticobtype,geneticobdescription,geneticobsymbols,obkeywords) "
geneSQLDict[row[coldict['GENE']]] = geneSQLDict[row[coldict['GENE']]] + \
"values('" + row[coldict['GENE']] + "','[URL unfurl="true"]http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=search&term="[/URL] + \
row[coldict['GENE']] + "','Gene','" + row[coldict['FULL NAME']] + " (from SAMSON database)','" + row[coldict['GENE']] + " " + \
row[coldict['ABBREVIATIONS AND ALTERNATIVE NAMES']] + "','" + row[coldict['GENE']] + " " + row[coldict['ABBREVIATIONS AND ALTERNATIVE NAMES']] + " SAMSON');"
if len(row[coldict["COUNT"]].strip()) == 0:
row[coldict["COUNT"]] = "null"
#samsonGeneticFact insert
samsonSQLDict[row[coldict['GENE']]] = "insert into samsonGeneticFact(geneticob,genesummaryorder,"
samsonSQLDict[row[coldict['GENE']]] = samsonSQLDict[row[coldict['GENE']]] + \
"changemagnitude,follicularlocalisation,genesummaryweightedscore,genesummarycount) select obid," +\
row[coldict['ORDER']] + ",'" + row[coldict['MAGNITUDE OF CHANGE']] + "','" + \
row[coldict["FOLLICULAR LOCALISATION"]] + "'," + row[coldict["WEIGHTED SCORE"]] + "," + \
row[coldict["COUNT"]] + " from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
samsonSQLDict[row[coldict['GENE']]] = samsonSQLDict[row[coldict['GENE']]].replace(",,",",null,")
samsonSQLDict[row[coldict['GENE']]] = samsonSQLDict[row[coldict['GENE']]].replace(",#N/A,",",null,")
#GeneticExpressionFact insert
expressionSQLDict[row[coldict['GENE']]] = "insert into geneticExpressionFact(geneticob,cellularlocalisation,lifecyclestage) " + \
"select obid,'" + row[coldict['CELLULAR LOCALISATION']] + "','" + row[coldict['CYCLE STAGE']] + \
"' from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
#GeneticFunctionFact insert
if row[coldict['FUNCTIONAL CLASSIFICATION']].strip().upper() != "UNKNOWN":
functionSQLDict[row[coldict['GENE']]] = "insert into geneticFunctionFact(geneticob,functionComment) " + \
"select obid,'" + row[coldict['FUNCTIONAL CLASSIFICATION']] + \
"' from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
#GeneticLocationFact insert
cytopos=row[coldict['HUMAN CHROMOSOME']].strip()
if cytopos != "-" and len(cytopos) > 0 :
xreflsid= "'[URL unfurl="true"]http://genome.ucsc.edu/cgi-bin/hgTracks?org=Human&db=hg17&position="[/URL] + cytopos + "'"
cytopos="'" + cytopos + "'"
species="'Homo Sapiens'"
locationSQLDict[row[coldict['GENE']]] = "insert into geneticLocationFact(geneticob,xreflsid,cytopos,speciesname) " + \
"select obid," + xreflsid + "," +cytopos + "," + species + \
" from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
#generate SQL that when executed on the Oracle database, will generate
#Postgres updates of the GeneticFunction and Location tables. Example Oracle SQL :
"""
select
/*+ RULE */
'insert into geneticfact(ob,factnamespace,attributename,attributevalue) ' ||
'select obid, ''EntrezGene'',''Summary'',''<dd>Geneid=' ||
to_char(geneid) || ' Symbol='||
symbol || ' Synonyms=' ||
synonyms || '<dd>Description='||
description || ' <dd>GOdescription=' ||
pubplsqlutils.getGoString(geneid) || ' <dd>Map=' ||
g.chromosome||':'||g.map_location || ''' from geneticob where obname = ''CRP1'''
from
pubstore.geneinfo g
where
(symbol='CRP1' or
synonyms like 'CRP1|' or
synonyms like '%|CRP1|%' or
synonyms like '%|CRP1') and taxid = 9606
#example pgsql :
insert into geneticfact(ob,factnamespace,attributename,attributevalue) select obid, 'EntrezGene','Summary','<dd>Geneid=1053 Symbol=CEBPE Synonyms=C/EBP-epsilon|CRP1<dd>Description=CCAAT/enhancer binding protein (C/EBP), epsilon <dd>GOdescription=DNA binding ; defense response ; nucleus ; regulation of transcription, DNA-dependent ; transcription <dd>Map=14:14q11.2' from geneticob where obname = 'CRP1'
insert into geneticfact(ob,factnamespace,attributename,attributevalue) select obid, 'EntrezGene','Summary','<dd>Geneid=1396 Symbol=CRIP1 Synonyms=CRHP|CRIP|CRP1<dd>Description=cysteine-rich protein 1 (intestinal) <dd>GOdescription=antimicrobial humoral response (sensu Vertebrata) ; cell proliferation ; cytoplasm ; metal ion binding ; zinc ion binding <dd>Map=14:14q32.33' from geneticob where obname = 'CRP1'
insert into geneticfact(ob,factnamespace,attributename,attributevalue) select obid, 'EntrezGene','Summary','<dd>Geneid=1465 Symbol=CSRP1 Synonyms=CRP|CRP1|CSRP|CYRP|D1S181E|DKFZp686M148<dd>Description=cysteine and glycine-rich protein 1 <dd>GOdescription=metal ion binding ; nucleus ; zinc ion binding <dd>Map=1:1q32' from geneticob where obname = 'CRP1'
if cytopos != "-" and len(cytopos) > 0 : """
outfile=writeHeader("../bin/bootstrapgenes.psql")
# SQL to set up SAMSON project list of genes
outfile.write("insert into GeneticObList(listName,maxmembership,listComment) values('SAMSON Database of Hair Follicle Genes',25000,'SAMSON Gene Project List. This List contains GeneticOb objects in the SAMSON database project');\n");
# SQL to insert genes
for value in geneSQLDict.values():
outfile.write(value)
outfile.write('\n')
# SQL to populate projects list
outfile.write("insert into geneticOblistMembershipLink(geneticObList,geneticOb) \
select l.obid,g.obid from \
geneticOb g, geneticObList l \
where l.listName = 'SAMSON Database of Hair Follicle Genes' and \
g.geneticObDescription like '%from SAMSON database%'; \n")
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgenesamson.psql")
for value in samsonSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgeneexpression.psql")
for value in expressionSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgenefunction.psql")
for value in functionSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgenelocation.psql")
for value in locationSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
print coldict
if __name__ == "__main__":
main()
cheers,
Juergen