Monday, April 30, 2018

Python code to extract Title, Author, and Dewey from KOHA Database


sudo apt-get install python-bs4
sudo apt-get install python-mysqldb
sudo apt-get install python-html5lib
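(python-html5lib is needed because the script below hands the "html5lib" parser to BeautifulSoup.) To confirm that all three packages are visible to the Python 2 interpreter before running the extraction, a quick import check like this can be used; it is only a sanity check, not part of the extraction itself:

#!/usr/bin/python
# Sanity check: the extraction script below needs all three of these imports.
import MySQLdb      # from python-mysqldb
import bs4          # from python-bs4
import html5lib     # parser backend passed to BeautifulSoup below
print "All imports OK"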


Extract the MARC data and insert it into a new table

1) Create the following table:

DROP TABLE IF EXISTS `newmarcrecords`;
CREATE TABLE `NewMarcRecords` (
    `BibID` VARCHAR(30) NOT NULL,
    `Leader` VARCHAR(30) NULL,
    `Title` VARCHAR(500) NOT NULL,
    `Auther` VARCHAR(500) NULL,
    `Publisher` VARCHAR(500) NULL,
    `PublishYear` VARCHAR(500) NULL,
    `PublishLocation` VARCHAR(500) NULL,
    `Subject` VARCHAR(500) NULL,
    `Classification` VARCHAR(500) NULL,
    `RecordSource` VARCHAR(500) NULL,
    `Pages` VARCHAR(500) NULL,
    `URL` VARCHAR(500) NULL,
    `CoverURL` VARCHAR(500) NULL,
    `Price` VARCHAR(30) NULL,
    `RecordStatus` VARCHAR(1) NULL,
    PRIMARY KEY (`BibID`)
) DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci ENGINE=InnoDB;
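
The two statements above can simply be pasted into the mysql client. If you would rather run them from Python, a sketch like the following also works; the file name newmarcrecords.sql is hypothetical (it would hold the two statements above), and the connection settings are assumed to match the script in step 2:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Optional sketch: create the NewMarcRecords table from Python.
# "newmarcrecords.sql" is a hypothetical file containing the two statements above.
import MySQLdb

db = MySQLdb.connect(host="localhost", user="root", passwd="admin123",
                     db="koha", charset='utf8')
cur = db.cursor()
with open("newmarcrecords.sql") as f:
    # The file holds two statements (DROP TABLE + CREATE TABLE), so split on ';'
    for statement in f.read().split(';'):
        if statement.strip():
            cur.execute(statement)
db.commit()
db.close()
print "NewMarcRecords table is ready"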


2) Run the following Python code:

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
DROP TABLE IF EXISTS `newmarcrecords`;
CREATE TABLE `NewMarcRecords` (
    `BibID` VARCHAR(30) NOT NULL ,
    `Leader` VARCHAR(30)  NULL ,
    `Title` VARCHAR(500) NOT NULL ,
    `Auther` VARCHAR(500)  NULL ,
    `Publisher` VARCHAR(500)  NULL ,
    `PublishYear` VARCHAR(500)  NULL ,
    `PublishLocation` VARCHAR(500)  NULL ,
    `Subject` VARCHAR(500)  NULL ,
    `Classification` VARCHAR(500)  NULL ,
    `RecordSource` VARCHAR(500)  NULL ,
    `Pages` VARCHAR(500)  NULL ,
    `URL` VARCHAR(500)  NULL ,
    `CoverURL` VARCHAR(500)  NULL ,
    `Price` VARCHAR(30) NULL ,
    `RecordStatus` VARCHAR(1) NULL ,
     PRIMARY KEY (`BibID`)) DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci ENGINE = InnoDB;
"""

import MySQLdb
from bs4 import BeautifulSoup

def GetValue(record, tag, field):
    """Return the text of subfield `field` of the first datafield `tag`
    in the parsed MARCXML record, or '---' when the field is missing."""
    datafield = record.find(tag=tag)              # e.g. <datafield tag="245">
    if datafield is not None:
        subfield = datafield.find(code=field)     # e.g. <subfield code="a">
        if subfield is not None:
            return subfield.get_text()
    return '---'
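# Example (illustrative data): for a parsed record that contains
#   <datafield tag="245" ind1="1" ind2="0">
#     <subfield code="a">An example title :</subfield>
#     <subfield code="b">a subtitle</subfield>
#   </datafield>
# GetValue(record, "245", "a") returns u'An example title :',
# and GetValue(record, "999", "z") returns '---' because that field is absent.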



db = MySQLdb.connect(host="localhost",    # your host, usually localhost
                     user="root",         # your username
                     passwd="admin123",   # your password
                     db="koha" ,          # name of the data base
                     charset='utf8')         

# Create a cursor object; it is used to run all the queries below
cur = db.cursor()

# Select only the biblio records that are not yet in NewMarcRecords
cur.execute("SELECT biblionumber, metadata FROM `biblio_metadata` WHERE CAST(biblionumber AS CHAR) NOT IN (SELECT BibID FROM NewMarcRecords)")

# Loop over every bibliographic record that has not been imported yet
for row in cur.fetchall():
    biblionumber = row[0]
    xmlMarc = row[1]
    # Parse the MARCXML blob; html5lib parses it as HTML, which is enough to
    # reach the lowercase <record>, <leader>, <datafield> and <subfield> elements
    y = BeautifulSoup(xmlMarc, "html5lib")
    Leader = y.record.leader.get_text()
    title = GetValue(y, "245", "a")
    author = GetValue(y, "100", "a")
    Publisher = GetValue(y, "260", "b")        # 260 $b = name of publisher
    PublishYear = GetValue(y, "260", "c")      # 260 $c = date of publication
    PublishLocation = GetValue(y, "260", "a")  # 260 $a = place of publication
    Subject = GetValue(y, "650", "a")
    Classification = GetValue(y, "082", "a")   # 082 $a = Dewey Decimal number
    RecordSource = GetValue(y, "956", "s")
    Pages = GetValue(y, "300", "a")
    URL = GetValue(y, "856", "u")
    CoverURL = GetValue(y, "956", "u")
    Price = GetValue(y, "956", "p")
    RecordStatus='0'

    try:
        cur.execute("""
                    INSERT INTO `newmarcrecords` (`BibID`, `Leader`, `Title`, `Auther`, `Publisher`, `PublishYear`, `PublishLocation`,
                    `Subject`, `Classification`, `RecordSource`, `Pages`, `URL`, `CoverURL`, `Price`, `RecordStatus`)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """,
                    (biblionumber, Leader, title, author, Publisher, PublishYear, PublishLocation,
                     Subject, Classification, RecordSource, Pages, URL, CoverURL, Price, RecordStatus))
        db.commit()
        print "success"
    except MySQLdb.Error as e:
        # Roll back the failed insert and continue with the next record
        db.rollback()
        print "Fail!"
        print e


    print title
    print "========================="

db.close()
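
Once the script has finished, a quick way to confirm that the title, author, and Dewey number (082 $a) actually landed in the new table is a small check like this, reusing the same connection settings:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Sketch: print a few rows from NewMarcRecords to verify the extraction.
import MySQLdb

db = MySQLdb.connect(host="localhost", user="root", passwd="admin123",
                     db="koha", charset='utf8')
cur = db.cursor()
cur.execute("SELECT BibID, Title, Auther, Classification FROM NewMarcRecords LIMIT 10")
for BibID, Title, Auther, Classification in cur.fetchall():
    print BibID, "|", Title, "|", Auther, "|", Classification
db.close()

The extraction script itself can be re-run at any time: its SELECT skips every biblionumber that is already present in NewMarcRecords, so only new records are inserted.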

