Transformer
public struct Transformer<Element, Device> : LayerType, Codable where Element : RandomizableType, Device : DeviceType
Transformer as introduced by Attention Is All You Need.
The transformer model shares an embedding matrix between the encoder and decoder and reuses the embedding weights to compute the decoder output distribution. Outputs of the transformer are normalized using log softmax.
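To make the weight tying and log-softmax output described above concrete, here is a minimal, framework-free sketch. The names and values (embeddingMatrix, decoderState, the small dimensions) are purely illustrative and do not correspond to the library's internal implementation.
Swift
import Foundation

// Framework-free sketch of embedding weight tying (illustrative values only).
let vocabSize = 4
let hiddenDim = 3

// Embedding matrix with shape [vocabSize, hiddenDim]; the same weights are
// reused as the output projection of the decoder.
let embeddingMatrix: [[Double]] = [
    [ 0.2, -0.1,  0.5],
    [-0.3,  0.8,  0.1],
    [ 0.7,  0.0, -0.4],
    [ 0.1,  0.6,  0.3]
]
let outputBias = [Double](repeating: 0, count: vocabSize)

// One decoder output position with shape [hiddenDim].
let decoderState: [Double] = [0.4, -0.2, 0.9]

// logits[v] = <embeddingMatrix[v], decoderState> + outputBias[v]
let logits = (0 ..< vocabSize).map { v -> Double in
    var dot = outputBias[v]
    for (w, x) in zip(embeddingMatrix[v], decoderState) {
        dot += w * x
    }
    return dot
}

// Log-softmax normalization, matching the normalized output distribution
// described above.
let maxLogit = logits.max()!
let logSumExp = maxLogit + log(logits.map { exp($0 - maxLogit) }.reduce(0, +))
let logProbs = logits.map { $0 - logSumExp }
print(logProbs)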
-
Declaration
Swift
public typealias Outputs = Tensor<Element, Device>
-
Token embedding shared between the encoder and the decoder; its weights are also reused to compute the output distribution.
Declaration
Swift
public var embedding: Embedding<Element, Device>
-
Positional encoding applied to the embedded inputs.
Declaration
Swift
public var positionalEncoding: PositionalEncoding<Element, Device>
-
Dropout layer used to regularize the model.
Declaration
Swift
public var dropout: Dropout<Element, Device>
-
Stack of transformer encoder layers.
Declaration
Swift
public var encoder: TransformerEncoder<Element, Device>
-
Stack of transformer decoder layers.
Declaration
Swift
public var decoder: TransformerDecoder<Element, Device>
-
Bias added when computing the decoder output distribution from the shared embedding weights.
Declaration
Swift
public var outputBias: Tensor<Element, Device>
-
Declaration
Swift
public var parameters: [Tensor<Element, Device>] { get }
-
Declaration
Swift
public var parameterPaths: [WritableKeyPath<Self, Tensor<Element, Device>>] { get }
-
Creates a new transformer following the architecture described in Attention Is All You Need.
Declaration
Swift
public init(encoderLayers: Int, decoderLayers: Int, vocabSize: Int, hiddenDim: Int, heads: Int, keyDim: Int, valueDim: Int, forwardDim: Int, dropout: Float = 0.1)
Parameters
encoderLayers
Number of encoder layers
decoderLayers
Number of decoder layers
vocabSize
Number of tokens in the vocabulary of the transformer
hiddenDim
Size of transformer layer outputs
heads
Number of attention heads in multi-head attention layers
keyDim
Size of key vectors in multi-head attention layers
valueDim
Size of value vectors in multi-head attention layers
forwardDim
Size of activations in pointwise feed-forward layers
dropout
Dropout rate
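As an illustration, the base configuration from the paper could be instantiated as follows. This is a sketch: the vocabulary size is a placeholder that depends on the task, and the CPU device type is assumed here.
Swift
// Sketch: hyperparameters of the "base" model from Attention Is All You Need.
// vocabSize and the CPU device type are placeholders, not prescribed values.
let model = Transformer<Float, CPU>(
    encoderLayers: 6,
    decoderLayers: 6,
    vocabSize: 32000,
    hiddenDim: 512,
    heads: 8,
    keyDim: 64,
    valueDim: 64,
    forwardDim: 2048,
    dropout: 0.1
)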
-
Computes the outputs of the decoder given the inputs for the encoder and decoder.
Declaration
Parameters
inputs
Tuple containing:
- Padded encoder inputs using -1 as the padding token.
- Padded decoder inputs using -1 as the padding token.
Return Value
Batch of sequences of log-softmax normalized distributions over the vocabulary of the transformer with shape [batchSize, seqlen, vocabDim]
-
Greedily decodes the most probable sequence of output symbols given a sequence of input tokens
Declaration
Swift
public func callAsFunction(inputSequence: [Int32], startToken: Int32, endToken: Int32, maxLength: Int) -> [Int32]
Parameters
inputSequence
Input tokens
startToken
First token to feed into the decoder. Subsequent tokens are generated autoregressively.
endToken
Token that ends decoding (end-of-sequence marker)
maxLength
Maximum length of the decoded sequence. If no endToken occurs within maxLength tokens, decoding is aborted.
Return Value
Most probable output sequence determined by greedy decoding.
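A usage sketch for greedy decoding, reusing the model instance from the initializer example above. The token ids are placeholders and would normally come from the vocabulary the model was trained with.
Swift
// Placeholder token ids; real values depend on the model's vocabulary.
let startToken: Int32 = 1
let endToken: Int32 = 2
let inputSequence: [Int32] = [5, 17, 42, 8]

// Greedily decode up to 50 output tokens.
let decoded = model(
    inputSequence: inputSequence,
    startToken: startToken,
    endToken: endToken,
    maxLength: 50
)
// decoded ends at the first endToken, or is truncated after maxLength tokens.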