################################################################################3 # SQL script to create the table necessary for dealing with aliases in DBLP # I guess some parts can be done with less temporary tables, but what I tried was just too slow... # # outcome: three tables: # 1) dblp_aliases_new: table with mappings between all aliases to get the aliases for one given name # usage: select authorAliases from dblp_aliases_new where author="" # 2) dblp_main_aliases_new: table with info about all aliases (in one field) plus additional author info (publication count, avg pub_count...) # usage: select all_aliases, pub_count, avg_pub_count, min_year, max_year from dblp_main_aliases_new where author_id="" # 3) dblp_authorid_ref_new: table similar to dblp_author_ref_new, but with additional author_ids which can be used to resolve aliases + pub_count # create a table with unique ids per author drop table if exists author_ids; CREATE TABLE author_ids ( author_id INT(11) NOT NULL AUTO_INCREMENT, author VARCHAR(150) NOT NULL, pub_count INT(5) NOT NULL, primary key(author_id)) charset = utf8; insert into author_ids (author, pub_count) select author, count(*) from dblp_author_ref_new a, dblp_pub_new p where a.id=p.id and p.dblp_key not like 'homepages/%' group by author; # create another table but without the primary key (which disallows aliases ids...) drop table if exists author_ids2; create table author_ids2 as select * from author_ids; create index authors_ids2_id on author_ids2 (author_id); create index authors_ids2_name on author_ids2 (author); # some clean up drop table if exists author_ids; # create a table with all pairs of aliases drop table if exists dblp_aliases_new; create table dblp_aliases_new as select a1.author AS author, a2.author AS authorAlias, p.id AS author_id from dblp_author_ref_new a2 join dblp_pub_new p join dblp_author_ref_new a1 on p.id = a1.id and a2.id = p.id where p.type = 'www' and p.dblp_key like 'homepages/%' and a2.author <> a1.author; create index aliases_name on dblp_aliases_new (author); # now change the ids for known aliases set @max =(select max(author_id) from author_ids2); update author_ids2 i, dblp_aliases_new a set i.author_id=@max+a.author_id where a.author=i.author; # create a table with one row per author only and sum up the publication counts drop table if exists dblp_main_aliases_new; create table dblp_main_aliases_new as select author, group_concat(author separator ' / ') as all_aliases, max(author_id) as author_id , sum(pub_count) as pub_count, 1 as avg_pub_count, 1 as start_year, 1 as end_year, 1 as num_sources from author_ids2 a group by a.author_id; create index main_aliases_name on dblp_main_aliases_new (author); create index main_aliases_id on dblp_main_aliases_new (author_id); create fulltext index main_aliases_author_full on dblp_main_aliases_new(author); create fulltext index main_aliasesall_author_full on dblp_main_aliases_new(all_aliases); # add the sum of the publication count to the aliases of the author_ids2 table update author_ids2 i, dblp_main_aliases_new a set i.pub_count=a.pub_count where i.author_id=a.author_id; ##### the homepage management alter table dblp_main_aliases_new add column (hp_dblp_key VARCHAR(150), hp_ee VARCHAR(200), hp_publisher VARCHAR(200)); alter table dblp_main_aliases_new charset=utf8; #update dblp_pub_new p, dblp_author_ref_new au, dblp_main_aliases_new a set a.hp_dblp_key=p.dblp_key, a.hp_ee=p.ee, a.hp_publisher=p.publisher, a.pub_count=a.pub_count-1-() where p.type='www' and p.dblp_key like 'homepages/%' and p.id=au.id and au.author=a.author; update dblp_pub_new p, dblp_author_ref_new au, dblp_main_aliases_new a set a.hp_dblp_key=p.dblp_key, a.hp_ee=p.ee, a.hp_publisher=p.publisher where p.type='www' and p.dblp_key like 'homepages/%' and p.id=au.id and au.author=a.author; # now drop the homepages from dblp_pub_new #delete dblp_author_ref_new a from dblp_author_ref_new a, dblp_pub_new p where p.id=a.id and type='www' and dblp_key like 'homepages/%'; #delete dblp_ref_new a from dblp_ref_new a, dblp_pub_new p where p.id=a.id and type='www' and dblp_key like 'homepages/%'; delete from dblp_pub_new where type='www' and dblp_key like 'homepages/%'; ########################################################################### # create a table similar to the dblp_author_ref_new but with the author id and the publication counter from dblp_main_aliases_new... # we do not include in dblp_author_ref_new directly as that one is exported to the public when dumping the database # -> we might change that later drop table if exists dblp_authorid_ref_new; create table dblp_authorid_ref_new select a.id, a.author, p.author_id as author_id, pub_count from dblp_author_ref_new a join author_ids2 p on a.author=p.author; create index aid_id on dblp_authorid_ref_new (id); create index aid_authors on dblp_authorid_ref_new (author); create index aid_author_id on dblp_authorid_ref_new (author_id); drop table author_ids2; # we want to get rid of people having published in to few sources. Most of them have written regular columns # for journals such as news, editors message etc. # --> update dblp_main_alises with the number of sources # group the authors by the number of sources they have published in drop table if exists foo; # group the publication of all prolific authors by the publication source create temporary table foo select a.author_id, count(distinct source) as co from dblp_authorid_ref_new a join dblp_pub_new p on a.id=p.id where source is not null group by a.author_id; update dblp_main_aliases_new a, foo b set a.num_sources=b.co where a.author_id=b.author_id; ###################################################################################################### # update the yearly averages # some authors have a disadvantage here because selected early papers are in DBLP, but not the rest... # use continuous paper ranges only? drop table if exists bar; create table bar select a.author_id, min(p.year) as start_year, max(p.year) as end_year, round(pub_count / (max(p.year)-min(p.year)+1)) as avg_pub_count from dblp_authorid_ref_new a join dblp_pub_new p on a.id=p.id where year > 0 group by author_id; create index bar2 on bar (author_id); update dblp_main_aliases_new a, bar b set a.avg_pub_count=b.avg_pub_count, a.start_year=b.start_year, a.end_year=b.end_year where a.author_id=b.author_id; drop table bar;