Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training.
Resource URI: https://dblp.l3s.de/d2r/resource/publications/journals/corr/abs-2401-05566
Home
|
Example Publications
Property
Value
dcterms:
bibliographicCitation
<
http://dblp.uni-trier.de/rec/bibtex/journals/corr/abs-2401-05566
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Adam_S._Jermyn
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Amanda_Askell
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Ansh_Radhakrishnan
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Buck_Shlegeris
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Carson_Denison
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Cem_Anil
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Daniel_M._Ziegler
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/David_Duvenaud
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Deep_Ganguli
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Ethan_Perez
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Evan_Hubinger
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Fazl_Barez
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Holden_Karnofsky
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jack_Clark
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jan_Brauner
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jared_Kaplan
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jesse_Mu
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Kamal_Ndousse
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Kshitij_Sachan
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Logan_Graham
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Marina_Favaro
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Meg_Tong
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Michael_Sellitto
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Mike_Lambert
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Monte_MacDiarmid
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Mrinank_Sharma
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Newton_Cheng
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Nicholas_Schiefer
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Nova_DasSarma
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Paul_F._Christiano
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Roger_Grosse
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Ryan_Greenblatt
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/S%E2%88%9A%E2%88%82ren_Mindermann
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Samuel_R._Bowman
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Shauna_Kravec
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Tamera_Lanham
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Tim_Maxwell
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Yuntao_Bai
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Zachary_Witten
>
foaf:
homepage
<
http://dx.doi.org/doi.org%2F10.48550%2FarXiv.2401.05566
>
foaf:
homepage
<
https://doi.org/10.48550/arXiv.2401.05566
>
dc:
identifier
DBLP journals/corr/abs-2401-05566
(xsd:string)
dc:
identifier
DOI doi.org%2F10.48550%2FarXiv.2401.05566
(xsd:string)
dcterms:
issued
2024
(xsd:gYear)
swrc:
journal
<
https://dblp.l3s.de/d2r/resource/journals/corr
>
rdfs:
label
Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training.
(xsd:string)
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Adam_S._Jermyn
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Amanda_Askell
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Ansh_Radhakrishnan
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Buck_Shlegeris
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Carson_Denison
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Cem_Anil
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Daniel_M._Ziegler
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/David_Duvenaud
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Deep_Ganguli
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Ethan_Perez
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Evan_Hubinger
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Fazl_Barez
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Holden_Karnofsky
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jack_Clark
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jan_Brauner
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jared_Kaplan
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jesse_Mu
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Kamal_Ndousse
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Kshitij_Sachan
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Logan_Graham
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Marina_Favaro
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Meg_Tong
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Michael_Sellitto
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Mike_Lambert
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Monte_MacDiarmid
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Mrinank_Sharma
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Newton_Cheng
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Nicholas_Schiefer
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Nova_DasSarma
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Paul_F._Christiano
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Roger_Grosse
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Ryan_Greenblatt
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/S%E2%88%9A%E2%88%82ren_Mindermann
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Samuel_R._Bowman
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Shauna_Kravec
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Tamera_Lanham
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Tim_Maxwell
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Yuntao_Bai
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Zachary_Witten
>
owl:
sameAs
<
http://bibsonomy.org/uri/bibtexkey/journals/corr/abs-2401-05566/dblp
>
owl:
sameAs
<
http://dblp.rkbexplorer.com/id/journals/corr/abs-2401-05566
>
rdfs:
seeAlso
<
http://dblp.uni-trier.de/db/journals/corr/corr2401.html#abs-2401-05566
>
rdfs:
seeAlso
<
https://doi.org/10.48550/arXiv.2401.05566
>
dc:
title
Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training.
(xsd:string)
dc:
type
<
http://purl.org/dc/dcmitype/Text
>
rdf:
type
swrc:Article
rdf:
type
foaf:Document
swrc:
volume
abs/2401.05566
(xsd:string)