3 # Copyright (C) 2022 MOBIUS
4 # Author: Blake Graham-Henderson <blake@mobiusconsortium.org>
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 # ---------------------------------------------------------------
21 use File::Path qw(make_path);
25 our $xmlconf = "/openils/conf/opensrf.xml";
32 our $seedTableUsedRowCount = 0;
33 our $seedTableRowCount = 100000;
39 'actor.workstation_setting',
42 'asset.copy_vis_attr_cache',
43 'authority.rec_descriptor',
44 'authority.simple_heading',
46 'authority.authority_linking',
48 'actor.org_unit_proximity',
49 'config.org_unit_setting_type_log',
50 'config.xml_transform',
51 'money.materialized_billable_xact_summary',
52 'serial.materialized_holding_code',
53 'vandelay.queued_bib_record_attr',
54 'config.print_template',
55 'config.workstation_setting_type',
64 # call number data can include ##URI##,
65 # which biblio.record_entry triggers create, so, they go first
68 'asset.uri_call_number_map',
69 'biblio.record_entry',
70 'biblio.monograph_part',
74 'acq.lineitem_detail',
77 'asset.copy_location',
80 'authority.record_entry',
82 'money.billable_xact',
85 # needs to come before actor.workstation
86 'money.bnm_desk_payment',
89 our $help = "Usage: ./make_concerto_from_evergreen_db.pl [OPTION]...
91 This program automates the process of making a new dataset for the Evergreen code repository.
92 We need connection details to the Evergreen database where the intended dataset lives AND
93 we need connection details to a stock Evergreen database that only has the seed data.
94 This code requires the second database for comparison reasons. It needs to know what data is
95 seed data and what data is not.
98 --output-folder Folder for our generated output
99 --xmlseed path to Evergreen opensrf.xml file for DB connection details to the seed database, created with --create-database --create-schema, NOT --load-all-sample
102 --xmlconfig path to Evergreen opensrf.xml file for DB connection details, default /openils/conf/opensrf.xml
103 --sample Number of rows to fetch eg --sample 100 (not implemented)
104 --debug Set debug mode for more verbose output.
108 "sample=s" => \$sample,
109 "xmlconfig=s" => \$xmlconf,
110 "xmlseed=s" => \$xmlconfseed,
111 "output-folder=s" => \$outputFolder,
123 # make the output folder if it doesn't exist
129 ) if ( !( -e $outputFolder ) );
131 # Gather a list of Evergreen Tables to process
132 my @evergreenTables = @{ getSchemaTables() };
134 "BEGIN;\n\n-- stop on error\n\\set ON_ERROR_STOP on\n\n"
135 . "-- Ignore constraints until we're done\nSET CONSTRAINTS ALL DEFERRED;\n\n";
137 while ( $#evergreenTables > -1 ) {
138 my $thisTable = shift @evergreenTables;
139 my $columnRef = shift @evergreenTables;
140 if ( checkTableForInclusion($thisTable) ) {
141 my $thisFile = $outputFolder . "/$thisTable.sql";
142 print "Processing $thisTable > $thisFile\n";
143 unlink $thisFile if -e $thisFile;
145 open( $thisFhandle, '>> ' . $thisFile );
146 binmode( $thisFhandle, ":utf8" );
147 my $lines = tableHandler( $thisTable, $columnRef, $thisFhandle );
149 unlink $thisFile if ( -e $thisFile && $lines == 0 );
150 push( @loadTables, $thisTable ) if ( -e $thisFile && $lines > 0 );
154 print "Skipping: $thisTable\n" if $debug;
157 $loadAll = loadTableOrderMaker( $loadAll, \@loadTables );
158 $loadAll .= "COMMIT;\n";
159 print "Writing loader > $outputFolder/load_all.sql\n";
160 open( OUT, "> $outputFolder/load_all.sql" );
161 binmode( OUT, ":utf8" );
166 sub loadTableOrderMaker {
167 my $loadString = shift;
168 my $includedTablesRef = shift;
169 my @includedTables = @{$includedTablesRef};
172 # Loop through the pre-defined order, and check those off
173 foreach (@loadOrder) {
176 foreach (@includedTables) {
177 if ( $includedTables[$pos] eq $otable ) {
178 $loadString .= makeLoaderLine($_);
188 foreach (@includedTables) {
189 if ( not defined $used{$pos} ) {
190 $loadString .= makeLoaderLine($_);
202 my $ret = "\\echo loading $table\n";
203 $ret .= "\\i $table.sql\n\n";
209 my $tableColumnRef = shift;
211 my $funcHandler = $table;
213 $funcHandler =~ s/\./_/g;
214 $funcHandler .= '_handler';
216 # if some tables need handled special, make a sub with the table name AKA sub biblio_record_entry_handler
217 if ( functionExists($funcHandler) ) {
221 . '($table, $tableColumnRef, $fHandle);';
225 $rowCount = standardHandler( $table, $tableColumnRef, $fHandle );
232 my %columns = %{$colRef};
236 while ( ( my $colname, my $colpos ) = each(%columns) ) {
237 push( @order, $colpos );
239 @order = sort { $a <=> $b } @order;
242 while ( ( my $colname, my $colpos ) = each(%columns) ) {
243 if ( $colpos == $thisPOS ) {
244 push( @ret, $colname );
255 my $funcname = shift;
256 return \&{$funcname} if defined &{$funcname};
263 $query .= "\nLIMIT $CHUNKSIZE OFFSET $offset\n";
264 print $query if $debug;
265 my @results = @{ dbhandler_query($query) };
269 sub standardHandler {
271 my $tableColumnRef = shift;
273 my $omitColumnsRef = shift;
274 my %omitColumn = %{$omitColumnsRef} if $omitColumnsRef;
275 my $query = "SELECT ";
276 my $sqlOutTop = "COPY $table (";
277 my $order = "ORDER BY ";
278 my @colOrder = @{ columnOrder($tableColumnRef) };
282 foreach (@colOrder) {
284 # if the calling code wants to remove some columns, we skip them here
285 if ( ( !$omitColumnsRef ) || ( not defined $omitColumn{$_} ) ) {
287 $sqlOutTop .= "$_, ";
288 $order .= "$colCount, ";
292 print "removing column: $_\n";
295 $query = substr( $query, 0, -2 ); # remove the trailing comma+space
296 $sqlOutTop = substr( $sqlOutTop, 0, -2 ); # remove the trailing comma+space
297 $order = substr( $order, 0, -2 ); # remove the trailing comma+space
298 $query .= " FROM $table\n$order";
300 # makes it possible to not have to quote strings, dates, etc.
301 $sqlOutTop .= ") FROM stdin;\n";
304 my @data = @{ getDataChunk( $query, $offset ) };
306 while ( $#data > 0 ) #skipping column def metadata at the end of the array
308 my $sqlOut = $sqlOutTop;
309 my @differencesFromSeed =
310 @{ removeDuplicateStockData( \@data, $table, $firstTime ) };
313 foreach (@differencesFromSeed) {
317 foreach ( @{$row} ) {
318 $_ = '\N' if !defined $_;
320 # escape reserved tokens
321 $_ =~ s/\n/\\n/g; # newline
322 $_ =~ s/\r/\\r/g; # carriage return
323 $_ =~ s/\t/\\t/g; # tab
324 $_ =~ s/\v/\\v/g; # vertical tab
325 $_ =~ s/\f/\\f/g; # form feed
328 $sqlOut = substr( $sqlOut, 0, -1 );
331 print $fHandle $sqlOut if $outCount > 0;
333 # postgres sql syntax for finish of stdin
334 print $fHandle "\\.\n\n" if $outCount > 0;
338 $offset += $CHUNKSIZE;
339 @data = @{ getDataChunk( $query, $offset ) };
341 print $fHandle injectSequenceUpdate($table);
350 sub biblio_record_entry_handler {
352 my $tableColumnRef = shift;
354 my %omitColumns = ( 'vis_attr_vector' => 1 );
355 return standardHandler( $table, $tableColumnRef, $fHandle, \%omitColumns );
358 sub actor_workstation_handler {
360 my $tableColumnRef = shift;
363 standardHandler( $table, $tableColumnRef, $fHandle, \%omitColumns );
364 print $fHandle <<'splitter';
366 -- a case where the deleted workstation had payments
367 INSERT INTO actor.workstation(id,name,owning_lib)
368 SELECT missingworkstation.id, aou.shortname||FLOOR(RANDOM() * 100 + 1)::INT, 1
372 DISTINCT mbdp.cash_drawer AS id
374 money.bnm_desk_payment mbdp
375 LEFT JOIN actor.workstation aw ON (mbdp.cash_drawer = aw.id)
379 JOIN actor.org_unit aou ON (aou.id=1);
381 -- anonymize workstation names
384 SET name=aou.shortname||'-'||aw.id
385 FROM actor.org_unit aou
387 aou.id=aw.owning_lib;
393 sub injectSequenceUpdate {
395 my @schema = split( /\./, $table );
396 my $schemaName = @schema[0];
398 my $query = <<'splitter';
401 SELECT t.oid::regclass AS table_name,
402 a.attname AS column_name,
403 s.relname AS sequence_name
405 JOIN pg_attribute AS a
406 ON a.attrelid = t.oid
408 ON d.refobjid = t.oid
409 AND d.refobjsubid = a.attnum
412 WHERE d.classid = 'pg_catalog.pg_class'::regclass
413 AND d.refclassid = 'pg_catalog.pg_class'::regclass
414 AND d.deptype IN ('i', 'a')
415 AND t.relkind IN ('r', 'P')
419 a.table_name = '!!tbname!!'::regclass
422 $query =~ s/!!tbname!!/$table/g;
423 my @results = @{ dbhandler_query($query) };
424 while ( $#results > 0 ) {
425 my $this = shift @results;
427 my $colname = @row[1];
428 my $seqname = @row[2];
429 $ret .= "\\echo sequence update column: !!colname!!\n";
431 "SELECT SETVAL('!!seqname!!', (SELECT MAX(!!colname!!) FROM !!tbname!!));\n";
432 $ret =~ s/!!tbname!!/$table/g;
433 $ret =~ s/!!colname!!/$colname/g;
434 $ret =~ s/!!seqname!!/$schemaName.$seqname/g;
442 sub removeDuplicateStockData {
443 my $resultsRef = shift;
445 my $firstTime = shift;
446 $seedTableRowCount = getTableRowCount( $table, 1 ) if ($firstTime);
447 $seedTableUsedRowCount = 0 if ($firstTime);
449 my @results = @{$resultsRef};
450 my $colRef = @results[$#results];
451 my %columns = %{$colRef};
457 last if $resultsPOS == $#results;
460 # don't bother if we know we've already used up the seed data table
461 if ( $seedTableUsedRowCount < $seedTableRowCount ) {
462 my @row = @{$rowRef};
465 my $select = "SELECT ";
466 my $where = "WHERE 1=1";
467 while ( ( my $colname, my $colpos ) = each(%columns) ) {
469 # compare ID numbers when there is an ID column, otherwise, compare the rest of the columns
470 if ( ( $colname ne 'id' && not defined $columns{'id'} )
471 || ( $colname eq 'id' ) )
473 $select .= "$colname, ";
475 defined @row[$colpos]
476 ) # if it's null data, the SQL needs to be "is null", not "="
479 $where .= " AND $colname = \$$pos";
480 push( @vals, @row[$colpos] );
483 $where .= " AND $colname is null";
488 # remove the trailing comma+space
489 $select = substr( $select, 0, -2 );
490 $select .= "\nFROM $table\n$where\n";
491 print $select if $debug;
492 print Dumper( \@vals ) if $debug;
493 my @res = @{ dbhandler_query( $select, \@vals, 1 ) };
495 # seed data doesn't have a match, want this row for our new dataset
497 push( @ret, $rowRef );
502 # Each time we match seed data, we count. If the number of rows found equals the
503 # number of total rows, we don't need to keep checking back on the seed database
504 $seedTableUsedRowCount++;
513 # exhausted the seed database table rows, this data can just blindly get added to the
515 push( @ret, $rowRef );
519 # print Dumper(\@ret);
520 print "Removed $removeCount rows (exists in seed data)\n" if $removeCount;
524 sub getSchemaTables {
527 my $query = <<'splitter';
528 SELECT schemaname||'.'||tablename
529 FROM pg_catalog.pg_tables
531 schemaname NOT IN('pg_catalog','information_schema')
536 my @results = @{ dbhandler_query($query) };
541 if ( getTableRowCount( @row[0] ) > 0 ) {
542 push( @ret, lc @row[0] );
543 push( @ret, getTableColumnNames( @row[0] ) );
546 print "no rows in @row[0]\n" if $debug;
549 last if $#results == $resultPos; # ignore the column header metadata
557 sub getTableRowCount {
561 my $query = "SELECT count(*) FROM $table";
562 my @results = @{ dbhandler_query( $query, undef, $seed ) };
567 last; # ignore column header metadata
573 sub getTableColumnNames {
576 my $query = "SELECT * FROM $table LIMIT 1";
577 my @results = @{ dbhandler_query($query) };
583 my $openilsfile = shift;
584 my $xml = new XML::Simple;
585 my $data = $xml->XMLin($openilsfile);
588 $data->{default}->{apps}->{"open-ils.storage"}->{app_settings}
589 ->{databases}->{database}->{host};
590 $conf{"db"} = $data->{default}->{apps}->{"open-ils.storage"}->{app_settings}
591 ->{databases}->{database}->{db};
593 $data->{default}->{apps}->{"open-ils.storage"}->{app_settings}
594 ->{databases}->{database}->{user};
596 $data->{default}->{apps}->{"open-ils.storage"}->{app_settings}
597 ->{databases}->{database}->{pw};
599 $data->{default}->{apps}->{"open-ils.storage"}->{app_settings}
600 ->{databases}->{database}->{port};
605 sub checkTableForInclusion {
607 my @schema = split( /\./, $table );
608 foreach (@skipTables) {
609 return 0 if ( lc $table eq lc $_ );
611 my @thisSchema = split( /\./, $_ );
612 return 0 if ( lc @schema[0] eq lc @thisSchema[0] );
618 sub logfile_readFile {
624 #print "Attempting open\n";
626 my $worked = open( inputfile, '< ' . $file );
628 print "******************Failed to read file*************\n";
630 binmode( inputfile, ":utf8" );
631 while ( !( open( inputfile, '< ' . $file ) ) && $trys < 100 ) {
632 print "Trying again attempt $trys\n";
638 #print "Finally worked... now reading\n";
639 @lines = <inputfile>;
643 print "Attempted $trys times. COULD NOT READ FILE: $file\n";
648 print "File does not exist: $file\n";
653 sub dbhandler_setupConnection {
661 $dbHandlerSeed = DBI->connect(
662 "DBI:Pg:dbname=$dbname;host=$host;port=$port",
666 post_connect_sql => "SET CLIENT_ENCODING TO 'UTF8'",
672 $dbHandler = DBI->connect(
673 "DBI:Pg:dbname=$dbname;host=$host;port=$port",
677 post_connect_sql => "SET CLIENT_ENCODING TO 'UTF8'",
684 sub dbhandler_query {
685 my $querystring = shift;
686 my $valuesRef = shift;
688 my @values = $valuesRef ? @{$valuesRef} : ();
692 $query = $dbHandler->prepare($querystring) if ( !$seed );
693 $query = $dbHandlerSeed->prepare($querystring) if ($seed);
696 $query->bind_param( $i, $_ );
700 my @columnNames = @{ $query->{NAME} };
703 foreach (@columnNames) {
709 while ( my $row = $query->fetchrow_arrayref() ) {
711 foreach ( @{$row} ) {
713 if ( ref $thisCol eq 'ARRAY' ) # handle [] datatypes
715 my $t = join( ',', @{$thisCol} );
716 if ( isStringArray($thisCol) == 1 ) {
717 $t = join( "','", @{$thisCol} );
719 push( @pushData, "{$t}" );
723 push( @pushData, $thisCol );
727 push( @ret, \@pushData );
730 push( @ret, \%colPos );
736 my $arrayRef = shift;
737 my @array = @{$arrayRef};
739 if ( $_ =~ m/[^\-^0-9^\.]/g ) {
747 my %dbconf = %{ getDBconnects($xmlconf) };
748 dbhandler_setupConnection(
749 $dbconf{"db"}, $dbconf{"dbhost"}, $dbconf{"dbuser"},
750 $dbconf{"dbpass"}, $dbconf{"port"}
753 %dbconf = %{ getDBconnects($xmlconfseed) };
754 dbhandler_setupConnection( $dbconf{"db"}, $dbconf{"dbhost"},
755 $dbconf{"dbuser"}, $dbconf{"dbpass"}, $dbconf{"port"}, 1 );
759 print "Checking command line arguments...\n" if ($debug);
761 if ( $outputFolder eq '' ) {
763 "Output folder not provided. Please pass in a command line path argument with --output-folder\n";
766 if ( !$xmlconfseed ) {
768 "Please provide a path to the Evergreen seed database conneciton details. Please pass in a command line path argument with --xmlseed\n";
772 if ( !-e $xmlconf ) {
774 "$xmlconf does not exist.\nEvergreen database xml configuration file does not exist. Please provide a path to the Evergreen opensrf.xml database conneciton details. --xmlconf\n";
778 if ( !-e $xmlconfseed ) {
780 "$xmlconfseed does not exist.\nEvergreen seed database xml configuration file does not exist. Please provide a path to the seed Evergreen opensrf.xml database conneciton details. --xmlconfseed\n";
784 # Trim any trailing / on path
785 $outputFolder =~ s/\/$//g;