Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned.
Resource URI: https://dblp.l3s.de/d2r/resource/publications/journals/corr/abs-2209-07858
Home
|
Example Publications
Property
Value
dcterms:
bibliographicCitation
<
http://dblp.uni-trier.de/rec/bibtex/journals/corr/abs-2209-07858
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Amanda_Askell
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Andy_Jones
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Anna_Chen
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Ben_Mann
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Catherine_Olsson
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Chris_Olah
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Danny_Hernandez
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Dario_Amodei
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Dawn_Drain
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Deep_Ganguli
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Eli_Tran-Johnson
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Ethan_Perez
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jack_Clark
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jackson_Kernion
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Jared_Kaplan
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Josh_Jacobson
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Kamal_Ndousse
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Liane_Lovitt
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Nelson_Elhage
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Nicholas_Joseph
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Nicholas_Schiefer
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Nova_DasSarma
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Sam_Bowman
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Sam_McCandlish
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Sam_Ringer
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Saurav_Kadavath
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Scott_Johnston
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Shauna_Kravec
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Sheer_El_Showk
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Stanislav_Fort
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Tom_Brown
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Tom_Conerly
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Tom_Henighan
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Tristan_Hume
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Yuntao_Bai
>
dc:
creator
<
https://dblp.l3s.de/d2r/resource/authors/Zac_Hatfield-Dodds
>
foaf:
homepage
<
http://dx.doi.org/doi.org%2F10.48550%2FarXiv.2209.07858
>
foaf:
homepage
<
https://doi.org/10.48550/arXiv.2209.07858
>
dc:
identifier
DBLP journals/corr/abs-2209-07858
(xsd:string)
dc:
identifier
DOI doi.org%2F10.48550%2FarXiv.2209.07858
(xsd:string)
dcterms:
issued
2022
(xsd:gYear)
swrc:
journal
<
https://dblp.l3s.de/d2r/resource/journals/corr
>
rdfs:
label
Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned.
(xsd:string)
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Amanda_Askell
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Andy_Jones
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Anna_Chen
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Ben_Mann
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Catherine_Olsson
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Chris_Olah
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Danny_Hernandez
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Dario_Amodei
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Dawn_Drain
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Deep_Ganguli
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Eli_Tran-Johnson
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Ethan_Perez
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jack_Clark
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jackson_Kernion
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Jared_Kaplan
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Josh_Jacobson
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Kamal_Ndousse
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Liane_Lovitt
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Nelson_Elhage
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Nicholas_Joseph
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Nicholas_Schiefer
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Nova_DasSarma
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Sam_Bowman
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Sam_McCandlish
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Sam_Ringer
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Saurav_Kadavath
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Scott_Johnston
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Shauna_Kravec
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Sheer_El_Showk
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Stanislav_Fort
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Tom_Brown
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Tom_Conerly
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Tom_Henighan
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Tristan_Hume
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Yuntao_Bai
>
foaf:
maker
<
https://dblp.l3s.de/d2r/resource/authors/Zac_Hatfield-Dodds
>
owl:
sameAs
<
http://bibsonomy.org/uri/bibtexkey/journals/corr/abs-2209-07858/dblp
>
owl:
sameAs
<
http://dblp.rkbexplorer.com/id/journals/corr/abs-2209-07858
>
rdfs:
seeAlso
<
http://dblp.uni-trier.de/db/journals/corr/corr2209.html#abs-2209-07858
>
rdfs:
seeAlso
<
https://doi.org/10.48550/arXiv.2209.07858
>
dc:
title
Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned.
(xsd:string)
dc:
type
<
http://purl.org/dc/dcmitype/Text
>
rdf:
type
swrc:Article
rdf:
type
foaf:Document
swrc:
volume
abs/2209.07858
(xsd:string)